calibration.json (20041B)
1 { 2 "calibration": { 3 "paper_slug": "agentvigil-generic-blackbox-2025", 4 "date": "2026-02-28", 5 "sonnet_scan_date": "2026-02-28", 6 "model": "opus", 7 "agreement_rate": 0.98, 8 "total_questions": 50, 9 "agreements": 49, 10 "disagreements": 1, 11 "disagreement_details": [ 12 { 13 "category": "statistical_methodology", 14 "question": "effect_sizes_reported", 15 "sonnet": {"applies": true, "answer": false}, 16 "opus": {"applies": true, "answer": true}, 17 "direction": "opus_generous", 18 "opus_justification": "The paper reports '71% success rate' vs '38% baseline' and states 'nearly a 100% improvement over the baseline attacks' (Section 5.1). This provides percentage improvement with baseline context, which the schema description explicitly lists as sufficient for YES: 'A paper that says 12% improvement over baseline (from 45% to 57%) provides enough context for YES.' The paper matches this pattern across both benchmarks.", 19 "sonnet_justification": "While the paper reports raw performance numbers, it does not report formal effect sizes (Cohen's d, odds ratios, etc.). The claim of 'nearly doubling performance' is contextually described but not accompanied by a formal effect size measure.", 20 "resolution_note": "Borderline case. The schema description lists 'percentage improvement with baseline context' as an acceptable form of effect size alongside Cohen's d and odds ratios. The paper provides relative improvement ('nearly 100% improvement') with explicit baseline and result numbers. Sonnet applies a stricter interpretation requiring formal statistical effect sizes. Opus reads the schema's YES example literally. Reasonable people could disagree." 21 } 22 ], 23 "summary": "Very high agreement (98%) between Sonnet and Opus on this paper. The single disagreement is on effect_sizes_reported, where Opus reads the schema's example of 'percentage improvement with baseline context' as matching the paper's reporting pattern, while Sonnet applies a stricter standard requiring formal effect size measures like Cohen's d. Both evaluations otherwise align on all 49 remaining questions, including the large number of false answers for statistical methodology, the strong evaluation design scores, and the complete absence of limitations discussion.", 24 "opus_checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. Algorithms are described in pseudocode (Appendix C) but no implementation is released." 30 }, 31 "data_released": { 32 "applies": true, 33 "answer": true, 34 "justification": "The paper evaluates on two public benchmarks: AgentDojo (Debenedetti et al., 2024) and VWA-adv (Wu et al., 2024b), both publicly available. The real-world case study uses open-source WebArena/magento2. No novel dataset was created that requires separate release." 35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "No requirements.txt, Dockerfile, conda environment, or dependency specification is provided. Appendix A lists model checkpoints but not the software environment needed to run experiments." 40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "No step-by-step reproduction instructions, README, or reproduction scripts are provided. Algorithms 1-3 in Appendix C describe logic in pseudocode but are not runnable instructions." 45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": false, 51 "justification": "All results in Tables 1-5 and Figures 3-4 are reported as point estimates (e.g., '71% success rate'). No confidence intervals, error bars, or ± notation appears anywhere in the paper." 52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": false, 56 "justification": "No statistical significance tests are used. Comparative claims ('nearly doubling performance', 'outperforms the baseline') are made by directly comparing point-estimate success rates without p-values, t-tests, or any formal testing." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "The paper reports percentage improvement with baseline context: '71% success rate' vs '38% baseline' with 'nearly a 100% improvement over the baseline attacks' (Section 5.1). Similar reporting for VWA-adv (70% vs 36%). Per the schema description, 'A paper that says 12% improvement over baseline (from 45% to 57%) provides enough context for YES.' The paper matches this pattern." 62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": false, 66 "justification": "The paper uses 142/173 task splits for AgentDojo and 99/100 for VWA-adv. The splits are described as 'randomly dividing' but no justification for these sizes or power analysis is provided." 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": false, 71 "justification": "All results are single-run point estimates. No standard deviation, variance, IQR, or any spread measure is reported across experimental runs. Given the stochastic nature of fuzzing (random seed sampling, random mutation selection), variance across runs could be substantial." 72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": true, 78 "justification": "Handcrafted adversarial prompts from AgentDojo and VWA-adv serve as baselines (Sections 5.1, 5.2). Additional baselines from OpenPromptInjection and InjecAgent are included in Appendix B.2 (Table 4)." 79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": true, 83 "justification": "All baselines are from 2024: AgentDojo (2024), VWA-adv (2024), OpenPromptInjection (2024), InjecAgent (2024). These represent the current state of the art in indirect prompt injection benchmarking." 84 }, 85 "ablation_study": { 86 "applies": true, 87 "answer": true, 88 "justification": "Section 5.3 presents an ablation study isolating contributions of three components: (1) initial corpus, (2) adaptive seed scoring, and (3) MCTS-based seed selection. Results shown in Figure 3." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "The paper reports attack success rate (ASR) as the primary metric and coverage as an additional metric (Figures 3 and 4). Utility scores for open-source models are also reported in Appendix B.1." 94 }, 95 "human_evaluation": { 96 "applies": false, 97 "answer": false, 98 "justification": "This is an adversarial security paper evaluating automated attack success rates on benchmarks with programmatic success criteria. Human evaluation of outputs is not relevant to the core claims about attack effectiveness." 99 }, 100 "held_out_test_set": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper explicitly separates tasks into fuzzing sets (used for optimization) and test sets (held-out for evaluation): 142/173 for AgentDojo and 99/100 for VWA-adv. Transferability is evaluated on the test set." 104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": true, 108 "justification": "Appendix B.3, Table 5 provides per-scenario breakdowns: AgentDojo by suite (Slack, Workspace, Travel, Banking) and VWA-adv by attack goal (Illusioning, Goal misdirection). Per-model results appear throughout Table 1." 109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper discusses failure against Claude-3.5-Sonnet (Section 5.1: 'both the baseline and AGENTVIGIL's prompts are ineffective against Claude-3.5-Sonnet'). Performance degradation against defenses is discussed in Sections 5.1 and 5.2." 114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": true, 118 "justification": "The paper reports that AGENTVIGIL's optimized prompts do not transfer to Claude-3.5-Sonnet (0.03-0.04 ASR, worse than baseline's 0.08-0.12). It also reports that in VWA-adv with defenses, 'AGENTVIGIL's performance declines and converges with the baseline' (Section 5.2)." 119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": true, 125 "justification": "Abstract claims of 71% and 70% success rates are supported by Tables 1-3. 'Nearly doubling' the baseline (38% to 71%, 36% to 70%) is accurate. Transferability claims are supported by test set results in Table 1. 'Promising results against defenses' is a hedged claim supported by Tables 2-3." 126 }, 127 "causal_claims_justified": { 128 "applies": true, 129 "answer": true, 130 "justification": "The ablation study (Section 5.3) uses controlled single-variable manipulation to support causal claims about which components contribute to performance. Replacing the initial corpus while keeping other components constant, and removing MCTS/scoring while keeping other components, constitutes adequate ablation design." 131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": false, 135 "justification": "The title claims 'Generic Black-Box Red-teaming' and the abstract claims applicability to 'diverse LLM agents', but testing is limited to two benchmarks (personal assistants in AgentDojo, web agents in VWA-adv) plus one real-world case study. Only English-language text-based injection is tested. Only specific types of injection goals are covered. The broad claims outstrip the tested scope." 136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": false, 140 "justification": "No alternative explanations for results are discussed. The speculation about Claude's resistance ('Claude is more vulnerable to simpler adversarial prompts') is presented without evidence or alternative considerations. No threats-to-validity section addresses confounds or alternative interpretations." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": true, 147 "justification": "Appendix A explicitly lists all model checkpoints with version IDs: o3-mini (o3-mini-2024-12-17), GPT-4o-mini (gpt-4o-mini-2024-07-18), GPT-4o (gpt-4o-2024-08-06), Claude-3.5-Sonnet (claude-3-5-sonnet-20241022), Gemini-2-flash-exp (gemini-2.0-flash-exp)." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": false, 152 "justification": "The adversarial prompt templates use 'placeholders to accommodate different variables' (Section 4.2) but the actual template text is not provided. Mutation methods (Shorten, Expand, Rephrase, Crossover, GenerateSimilar) are described in natural language only (Section 4.3) without the actual prompt text sent to the helper LLM." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": false, 157 "justification": "LLM inference hyperparameters (temperature, top-p, max tokens) are not reported for any model. Fuzzing parameters (10 iterations, 3 or 10 mutations per iteration, top-5 seeds for transferability) are stated, but LLM API parameters are absent." 158 }, 159 "scaffolding_described": { 160 "applies": false, 161 "answer": false, 162 "justification": "AGENTVIGIL attacks third-party agents (AgentDojo's agent, VWA-adv's agent, WebArena's default agent) that are black-box systems. The authors cannot describe the internal scaffolding of these external agents. AGENTVIGIL itself is an attack framework, not an agentic scaffold." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": false, 167 "justification": "The initial corpus collection is described qualitatively as being 'from human heuristics, online resources, existing prompt injection research' (Section 4.2) with two citations. The number of initial templates, specific filtering criteria, and exact sources are not documented in sufficient detail to reproduce the corpus." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": false, 174 "justification": "There is an 'Impact Statement' section before References that discusses dual-use concerns, but no dedicated Limitations or Threats to Validity section exists. The Impact Statement does not discuss methodological limitations of the evaluation." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": false, 179 "justification": "No threats-to-validity section exists. The Impact Statement contains generic statements about ongoing research but does not address specific threats such as benchmark selection bias, limited number of injection task types, stochastic evaluation via random task sampling, or model version sensitivity." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": false, 184 "justification": "The threat model (Section 3) defines attacker capabilities and exclusions, but this is not the same as bounding the results. The paper does not explicitly state what the results do NOT show, such as whether the approach would generalize to non-English agents, non-text modalities, or agent architectures beyond the two tested benchmarks." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": false, 191 "justification": "Generated adversarial prompts and raw per-task attack success logs are not released. Only aggregate success rates appear in tables. Independent verification of underlying results is not possible." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "The primary data comes from two public benchmarks. Section 4.2 describes corpus collection from 'human heuristics, online resources, existing prompt injection research' with specific citations. Benchmark task splits are described with counts (142/173 for AgentDojo, 99/100 for VWA-adv). Random division procedure is stated." 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants. All evaluation uses automated benchmarks with pre-defined tasks." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "The fuzzing pipeline is documented through Algorithms 1-3 (Appendix C) and Figure 2. The flow from corpus collection to seed selection to mutation to scoring to iteration is clearly described with pseudocode." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "No funding source, acknowledgments section, or grant information appears in the paper. As a preprint from three universities (UC Berkeley, Washington University, UC Santa Barbara), research funding would be expected but is not disclosed." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are clearly listed on the first page: UC Berkeley (1), Washington University, Saint Louis (2), and UC Santa Barbara (3), with each author's affiliation number indicated." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": false, 223 "justification": "No funding is disclosed, making it impossible to assess funder independence. Per schema conventions, absence of disclosure is treated as NO." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests statement, patent disclosure, or financial interests declaration appears in the paper." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": true, 234 "answer": false, 235 "justification": "The paper specifies model checkpoints (Appendix A) but does not state training data cutoff dates for any of the LLMs used (o3-mini, GPT-4o, Claude-3.5-Sonnet, Gemini-2-flash-exp). This matters because benchmark tasks could appear in training data." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": true, 239 "answer": false, 240 "justification": "No discussion of whether AgentDojo or VWA-adv benchmark tasks appeared in the training data of the evaluated LLMs. Both benchmarks are from 2024 and some models' training data may include descriptions of these benchmarks." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": true, 244 "answer": false, 245 "justification": "AgentDojo (arXiv:2406.13352, June 2024) and VWA-adv (2024) were published before some model checkpoints. GPT-4o (gpt-4o-2024-08-06) may have trained on data including these benchmark descriptions. This contamination risk is not discussed." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. All evaluation is automated using benchmark frameworks." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants. The paper discusses ethical considerations for the real-world case study (using a local copy) but this is not an IRB-reviewed human subjects study." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": false, 289 "justification": "Extensive LLM API usage (GPT-4o, o3-mini, GPT-4o-mini for helper, multiple models for evaluation, across hundreds of tasks and 10 fuzzing iterations) but no API costs, token counts, or latency figures are reported. The paper mentions excluding some defenses due to 'high computational costs' without quantifying its own costs." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "No total computational budget is stated. The paper describes 10 fuzzing iterations with 3 or 10 mutations per iteration across 142+ tasks, implying substantial API usage, but no total cost, GPU hours, or compute budget is reported." 295 } 296 } 297 } 298 } 299 }