calibration.json (18367B)
1 { 2 "calibration_metadata": { 3 "paper_slug": "agenttypo-adaptive-typographic-2025", 4 "calibration_date": "2026-02-28", 5 "sonnet_scan_date": "2026-02-28", 6 "calibrator_model": "opus", 7 "agreement_rate": 0.96, 8 "total_questions": 50, 9 "agreements": 48, 10 "disagreements": 2 11 }, 12 "disagreements": [ 13 { 14 "category": "evaluation_design", 15 "question": "held_out_test_set", 16 "sonnet": {"applies": true, "answer": true}, 17 "opus": {"applies": true, "answer": false}, 18 "direction": "sonnet_generous", 19 "explanation": "Sonnet says 'there is no indication that tasks were used for tuning — hyperparameter selection appears to be done separately (Section VI.G explores lambda and step budget).' However, Section VI.G explicitly tunes hyperparameters (lambda, step budget, number of retrieved examples) by measuring ASR on the same 77 VWA-Adv tasks shown in Figures 6, 7, and 8. There is no mention of a separate dev split for tuning vs. a held-out split for final evaluation. The hyperparameters were selected based on performance on the same data that the main results are reported on. Per the schema: 'If unclear whether the reported numbers are on data used for any selection decisions, NO.'" 20 }, 21 { 22 "category": "conflicts_of_interest", 23 "question": "funder_independent_of_outcome", 24 "sonnet": {"applies": false, "answer": false}, 25 "opus": {"applies": true, "answer": false}, 26 "direction": "applies_boundary", 27 "explanation": "Sonnet sets applies=false, reasoning that no funder is identifiable so NA applies. Opus sets applies=true because the paper does not explicitly state it is unfunded — the schema says 'NA if unfunded' but absence of funding disclosure is not the same as confirmed absence of funding. Since the paper could have disclosed funding (or its absence) and did not, the criterion is applicable and the answer is false. However, this is a genuine applies-boundary case since the schema does provide NA for unfunded work." 28 } 29 ], 30 "opus_checklist": { 31 "artifacts": { 32 "code_released": { 33 "applies": true, 34 "answer": false, 35 "justification": "No GitHub repository URL, Zenodo archive, or any code link is provided anywhere in the paper. The ATPI algorithm, optuna-based TPE implementation, and multi-LLM scaffolding are described abstractly but no code is released." 36 }, 37 "data_released": { 38 "applies": true, 39 "answer": true, 40 "justification": "The paper uses the publicly available VWA-Adv benchmark (referenced with links to VisualWebArena). No proprietary data was collected — all experiments use this existing public benchmark of 77 tasks across three domains." 41 }, 42 "environment_specified": { 43 "applies": true, 44 "answer": false, 45 "justification": "Only 'eight NVIDIA RTX 3090 GPUs' and the 'optuna' library are mentioned. No requirements.txt, Dockerfile, conda environment, or detailed list of library versions is provided. This is insufficient to recreate the environment." 46 }, 47 "reproduction_instructions": { 48 "applies": true, 49 "answer": false, 50 "justification": "Algorithm 1 provides pseudocode for ATPI, but no step-by-step reproduction instructions, README, or runnable scripts are included. A competent researcher could not reproduce results without substantial implementation effort and guesswork." 51 } 52 }, 53 "statistical_methodology": { 54 "confidence_intervals_or_error_bars": { 55 "applies": true, 56 "answer": false, 57 "justification": "All results in Tables III and IV are point estimates (e.g., '0.45', '0.68'). No confidence intervals, error bars, or uncertainty measures are reported despite three trials per task being run." 58 }, 59 "significance_tests": { 60 "applies": true, 61 "answer": false, 62 "justification": "The paper claims 'AgentTypo significantly outperforms' baselines repeatedly but uses no statistical significance tests — no p-values, t-tests, bootstrap tests, or any other formal test." 63 }, 64 "effect_sizes_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "Only raw ASR values are reported (e.g., 0.23 vs. 0.45). While both baseline and improved numbers are given, no standardized effect sizes (Cohen's d, odds ratios, relative risk) are provided. The schema asks for more than raw differences." 68 }, 69 "sample_size_justified": { 70 "applies": true, 71 "answer": false, 72 "justification": "The VWA-Adv benchmark has 77 tasks. No justification is given for whether this sample size provides adequate statistical power for the comparative claims. No power analysis is discussed." 73 }, 74 "variance_reported": { 75 "applies": true, 76 "answer": false, 77 "justification": "Section VI.A states 'each prompt is executed with three independent trials' but only aggregate ASR is reported. No standard deviation, variance, interquartile range, or any spread measure across trials is provided." 78 } 79 }, 80 "evaluation_design": { 81 "baselines_included": { 82 "applies": true, 83 "answer": true, 84 "justification": "Four baselines are compared: Raw Prompt Injection, InjecAgent, AdvAgent, and AgentAttack, covering both text-based and image-based attack methods." 85 }, 86 "baselines_contemporary": { 87 "applies": true, 88 "answer": true, 89 "justification": "Baselines include AgentAttack (ICLR 2025), AdvAgent (ICML 2025), and InjecAgent (ACL 2024), all contemporary to the October 2025 submission date and representing current state of the art." 90 }, 91 "ablation_study": { 92 "applies": true, 93 "answer": true, 94 "justification": "Section VI.E (Table IV) presents ablation across four configurations: ATPI alone, Strategy Library alone, Strategy+RAG, and full AgentTypo-pro, clearly isolating each component's contribution." 95 }, 96 "multiple_metrics": { 97 "applies": true, 98 "answer": false, 99 "justification": "Only Attack Success Rate (ASR) is used as the evaluation metric in the main results tables. LPIPS stealthiness is shown only in the hyperparameter sweep (Figure 6), not as a standard comparison metric across all methods. No secondary metrics like query cost, latency, or defense evasion rate are used." 100 }, 101 "human_evaluation": { 102 "applies": true, 103 "answer": false, 104 "justification": "Manual human review is mentioned only to validate the LLM-based scorer's accuracy, not as a systematic evaluation of attack outputs, agent behavior, or visual stealthiness. No human study is conducted." 105 }, 106 "held_out_test_set": { 107 "applies": true, 108 "answer": false, 109 "justification": "Section VI.G tunes hyperparameters (lambda, step budget, number of retrieved examples) by measuring ASR on the same 77 VWA-Adv tasks (Figures 6, 7, 8). No separate dev/test split is mentioned. The reported numbers are on data used for hyperparameter selection decisions." 110 }, 111 "per_category_breakdown": { 112 "applies": true, 113 "answer": true, 114 "justification": "Table III provides per-domain breakdowns (Classifieds, Shopping, Reddit) and per-model breakdowns (GPT-4V, GPT-4o, GPT-4o-mini, Gemini-1.5-Pro, Claude-3-Opus) for both image+text and image-only settings." 115 }, 116 "failure_cases_discussed": { 117 "applies": true, 118 "answer": true, 119 "justification": "Section VIII discusses the stealth-effectiveness trade-off limitation. Section VI.D discusses why AgentAttack fails on complex tasks (0% ASR on email). The paper acknowledges where its method requires conspicuous text." 120 }, 121 "negative_results_reported": { 122 "applies": true, 123 "answer": true, 124 "justification": "Table IV shows Strategy Library alone yields lower ASR than the full system. Section VI.G shows ASR degrades when retrieving more than 5 examples. These are genuine negative findings about component configurations." 125 } 126 }, 127 "claims_and_evidence": { 128 "abstract_claims_supported": { 129 "applies": true, 130 "answer": true, 131 "justification": "The abstract claims 'raises the success rate from 23% to 45%' on GPT-4o image-only, and '68% ASR in image+text settings.' Table III confirms both numbers for GPT-4o. Quantitative claims in the abstract match the results." 132 }, 133 "causal_claims_justified": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper makes causal claims through ablation (e.g., 'Incorporating the Strategy Library shows a substantial improvement in ASR'). While the ablation design controls single variables, the absence of statistical tests over only 77 tasks means observed differences may not be distinguishable from noise. The causal claims are not adequately supported." 137 }, 138 "generalization_bounded": { 139 "applies": true, 140 "answer": false, 141 "justification": "The abstract and conclusion claim AgentTypo 'poses a practical and potent threat to multimodal agents' without bounding to the tested setting of three simulated websites from VWA-Adv. The title says 'Black-box Multimodal Agents' generically." 142 }, 143 "alternative_explanations_discussed": { 144 "applies": true, 145 "answer": false, 146 "justification": "No alternative explanations are considered for the observed improvements. The paper does not discuss whether improvements over AgentAttack could be due to specific task characteristics, benchmark artifacts, or other confounds." 147 } 148 }, 149 "setup_transparency": { 150 "model_versions_specified": { 151 "applies": true, 152 "answer": false, 153 "justification": "Models are cited as 'GPT-4V', 'GPT-4o', 'GPT-4o-mini', 'Gemini-1.5-Pro', and 'Claude-3-Opus' without snapshot dates or API version strings. The attacker/scorer/summarizer LLMs are described only as 'GPT-4' without version. Per the schema, marketing names without snapshot dates do not count." 154 }, 155 "prompts_provided": { 156 "applies": true, 157 "answer": false, 158 "justification": "System prompts for the Attacker LLM, Scorer LLM, and Summarizer LLM are described in natural language only (e.g., 'formulates a hijacking prompt according to a specified adversarial goal'). Table V shows example output prompts, not the system prompts used to generate them. The actual prompt text sent to these LLMs is not provided." 159 }, 160 "hyperparameters_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Section VI.C reports: stealthiness weight lambda=10.0, maximum 20 optimization steps, top-k=5 retrieval examples, success threshold=0.8. Table I lists ATPI parameter search ranges (font size 10-150, color 0-255, position, etc.)." 164 }, 165 "scaffolding_described": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section V describes AgentTypo-pro's multi-LLM scaffolding in detail: Attacker LLM (Eq. 7), Scorer LLM (Eq. 8), RAG module (Eq. 9), Summarizer LLM (Eq. 10), with Figure 4 showing the full pipeline, interaction logic, and feedback loops." 169 }, 170 "data_preprocessing_documented": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section III.A and VI.A describe the data setup: JavaScript annotation of interactable elements, SoM parsing, captioning by the same LVLM. The VWA-Adv benchmark's 77 tasks across three websites are described." 174 } 175 }, 176 "limitations_and_scope": { 177 "limitations_section_present": { 178 "applies": true, 179 "answer": true, 180 "justification": "Section VIII 'Limitations' is a dedicated section discussing the stealth-effectiveness trade-off and restricted benchmark scope (three websites)." 181 }, 182 "threats_to_validity_specific": { 183 "applies": true, 184 "answer": false, 185 "justification": "The two limitations discussed (stealth trade-off and limited benchmark) are partially specific to this study, but the section does not address key validity threats: the 77-task sample size, the reliability and potential bias of the LLM-based scorer, hyperparameter tuning on the test set, or whether results depend on specific VWA-Adv task characteristics." 186 }, 187 "scope_boundaries_stated": { 188 "applies": true, 189 "answer": false, 190 "justification": "No explicit statements about what the results do NOT show. The benchmark limitation is acknowledged but no specific claims are retracted or bounded. No statement like 'these results do not imply effectiveness against text-only agents or real-world web deployments.'" 191 } 192 }, 193 "data_integrity": { 194 "raw_data_available": { 195 "applies": true, 196 "answer": false, 197 "justification": "Raw experimental data (individual task outcomes, LLM scorer responses, agent action logs, generated adversarial images) are not released. Only aggregate ASR tables are published." 198 }, 199 "data_collection_described": { 200 "applies": true, 201 "answer": true, 202 "justification": "The VWA-Adv benchmark is a described public dataset. Section VI.A documents its composition: 77 adversarial tasks across Classifieds, Shopping, and Reddit domains, built on VisualWebArena." 203 }, 204 "recruitment_methods_described": { 205 "applies": false, 206 "answer": false, 207 "justification": "No human participants are involved. The study is a purely automated benchmark evaluation with LLM-based agents. Recruitment methods are not applicable." 208 }, 209 "data_pipeline_documented": { 210 "applies": true, 211 "answer": true, 212 "justification": "The pipeline is documented: task selection from VWA-Adv → ATPI image modification (Algorithm 1) → agent execution with 3 trials → LLM-based scoring → ASR calculation. Sections IV-VI describe each stage." 213 } 214 }, 215 "conflicts_of_interest": { 216 "funding_disclosed": { 217 "applies": true, 218 "answer": false, 219 "justification": "No acknowledgments section and no mention of funding sources anywhere in the paper." 220 }, 221 "affiliations_disclosed": { 222 "applies": true, 223 "answer": true, 224 "justification": "All authors (Yanjie Li, Yiming Cao, Dong Wang, Bin Xiao) are identified as affiliated with the Computing Department, Hong Kong Polytechnic University, listed in the author footnote on page 1." 225 }, 226 "funder_independent_of_outcome": { 227 "applies": true, 228 "answer": false, 229 "justification": "No funding is disclosed. The paper does not state it is unfunded, so we cannot confirm the 'NA if unfunded' exception applies. Absence of funding disclosure means independence cannot be verified." 230 }, 231 "financial_interests_declared": { 232 "applies": true, 233 "answer": false, 234 "justification": "No competing interests statement is present anywhere in the paper. Per the schema, absence of disclosure is not the same as absence of conflict." 235 } 236 }, 237 "contamination": { 238 "training_cutoff_stated": { 239 "applies": true, 240 "answer": false, 241 "justification": "The paper uses GPT-4V, GPT-4o, GPT-4o-mini, Gemini-1.5-Pro, and Claude-3-Opus as target agents but does not state training data cutoff dates for any model. This matters for assessing whether models could have seen VWA-Adv/VisualWebArena tasks during training." 242 }, 243 "train_test_overlap_discussed": { 244 "applies": true, 245 "answer": false, 246 "justification": "No discussion of whether VWA-Adv benchmark tasks or VisualWebArena data could appear in the training data of the evaluated LVLMs. VisualWebArena was published in 2024 and models trained after that could have seen it." 247 }, 248 "benchmark_contamination_addressed": { 249 "applies": true, 250 "answer": false, 251 "justification": "VWA-Adv is built on VisualWebArena (2024). Several evaluated models may have training data including this benchmark. The paper does not discuss contamination risk at all." 252 } 253 }, 254 "human_studies": { 255 "pre_registered": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants are involved. This is a purely automated benchmark evaluation study." 259 }, 260 "irb_or_ethics_approval": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants are involved. IRB approval is not applicable." 264 }, 265 "demographics_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants are involved. Demographics are not applicable." 269 }, 270 "inclusion_exclusion_criteria": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants are involved. Inclusion/exclusion criteria are not applicable." 274 }, 275 "randomization_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants are involved. Randomization of participant assignment is not applicable." 279 }, 280 "blinding_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants are involved. Blinding is not applicable." 284 }, 285 "attrition_reported": { 286 "applies": false, 287 "answer": false, 288 "justification": "No human participants are involved. Attrition is not applicable." 289 } 290 }, 291 "cost_and_practicality": { 292 "inference_cost_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "The paper calls GPT-4o, GPT-4V, and other commercial APIs extensively (3 trials × 77 tasks × up to 20 optimization steps per task × multiple models) but provides no cost estimates, token counts, or API expenditure figures." 296 }, 297 "compute_budget_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "Only 'eight NVIDIA RTX 3090 GPUs' is mentioned for running local models (ATPI captioners). Total wall-clock time, GPU hours, or API budget for the full experimental suite are not reported." 301 } 302 } 303 } 304 }