calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (18806B)
      1 {
      2   "calibration": {
      3     "paper_slug": "agent-security-bench-2024",
      4     "sonnet_scan_date": "2026-02-28",
      5     "opus_calibration_date": "2026-02-28",
      6     "total_questions": 50,
      7     "total_fields": 100,
      8     "agreements": 99,
      9     "disagreements": 1,
     10     "agreement_rate": 0.990,
     11     "disagreement_details": [
     12       {
     13         "category": "conflicts_of_interest",
     14         "question": "funder_independent_of_outcome",
     15         "sonnet": {
     16           "applies": false,
     17           "answer": false,
     18           "justification": "No funding source is disclosed, so this criterion cannot be evaluated."
     19         },
     20         "opus": {
     21           "applies": true,
     22           "answer": false,
     23           "justification": "No funding source is disclosed. The schema says 'NA if unfunded,' but the absence of a funding disclosure does not confirm the work is unfunded -- it may simply be undisclosed. Since we cannot confirm unfunded status, applies=true and answer=false (independence cannot be verified)."
     24         },
     25         "direction": "applies_boundary",
     26         "explanation": "Sonnet treats missing funding disclosure as confirming unfunded status (applies=false). Opus treats it as unknown funding status, which means the criterion applies but cannot be satisfied. The schema says 'NA if unfunded' but does not address the case where funding status is itself unknown."
     27       }
     28     ],
     29     "opus_checklist": {
     30       "artifacts": {
     31         "code_released": {
     32           "applies": true,
     33           "answer": true,
     34           "justification": "The abstract states 'Our code can be found at https://github.com/agiresearch/ASB.' Appendix F (Reproducibility Statement) confirms source code availability including attack scripts, configuration files, and Docker setup."
     35         },
     36         "data_released": {
     37           "applies": true,
     38           "answer": true,
     39           "justification": "Benchmark data (tool definitions in JSONL format) is included in the GitHub repository: data/all_normal_tools.jsonl and data/all_attack_tools.jsonl, referenced in Appendix C.2.6."
     40         },
     41         "environment_specified": {
     42           "applies": true,
     43           "answer": true,
     44           "justification": "Appendix F states that environment setup is provided via requirements.txt for GPU/non-GPU systems, with Conda and Docker installation instructions included in the repository."
     45         },
     46         "reproduction_instructions": {
     47           "applies": true,
     48           "answer": true,
     49           "justification": "Appendix F describes predefined attack scripts (e.g., scripts/agent_attack.py), YAML configuration files in config/, and step-by-step instructions for setting up API keys and running experiments."
     50         }
     51       },
     52       "statistical_methodology": {
     53         "confidence_intervals_or_error_bars": {
     54           "applies": true,
     55           "answer": false,
     56           "justification": "All results in Tables 5-8, 14-21 are reported as point estimate percentages only. No confidence intervals, error bars, or uncertainty measures appear anywhere in the paper."
     57         },
     58         "significance_tests": {
     59           "applies": true,
     60           "answer": false,
     61           "justification": "The paper makes numerous comparative claims (e.g., 'Mixed Attack is the most impactful' and 'DPI is more effective than IPI'; see Section 5.3 and Appendix D.1.1) based solely on comparing raw percentages, with no statistical significance tests."
     62         },
     63         "effect_sizes_reported": {
     64           "applies": true,
     65           "answer": false,
     66           "justification": "No standardized effect size measures are reported. Results are raw ASR percentages only, with no Cohen's d, odds ratios, or contextualized effect magnitudes."
     67         },
     68         "sample_size_justified": {
     69           "applies": true,
     70           "answer": false,
     71           "justification": "The benchmark uses 50 agent tasks and 400 attack tasks (Table 3), but no justification is provided for why these quantities were chosen or whether they provide sufficient statistical power for the comparisons made."
     72         },
     73         "variance_reported": {
     74           "applies": true,
     75           "answer": false,
     76           "justification": "No variance, standard deviation, or spread measures are reported across runs. All results appear to be single-run numbers with no indication of result stability across repetitions."
     77         }
     78       },
     79       "evaluation_design": {
     80         "baselines_included": {
     81           "applies": true,
     82           "answer": true,
     83           "justification": "Table 12 compares ASB against InjecAgent and AgentDojo on attack types, defense types, scenarios, tools, and test cases. Attack and defense results are compared across 13 LLM backbones."
     84         },
     85         "baselines_contemporary": {
     86           "applies": true,
     87           "answer": true,
     88           "justification": "The two main comparison benchmarks (InjecAgent, Zhan et al. 2024; AgentDojo, Debenedetti et al. 2024) are contemporary 2024 works. Defense methods used are from 2022-2024."
     89         },
     90         "ablation_study": {
     91           "applies": true,
     92           "answer": true,
     93           "justification": "Appendix D.1.2 systematically analyzes different attack combinations (DPI+IPI, DPI+MP, IPI+MP, full mixed) in Table 14. Appendix D.1.3 tests five different backdoor triggers in Table 15. These function as ablations of attack components."
     94         },
     95         "multiple_metrics": {
     96           "applies": true,
     97           "answer": true,
     98           "justification": "Seven distinct evaluation metrics are defined in Table 4 and used throughout: ASR, Refuse Rate, PNA, BP, FNR, FPR, and the novel NRP metric."
     99         },
    100         "human_evaluation": {
    101           "applies": false,
    102           "answer": false,
    103           "justification": "The evaluation is entirely automated via simulated tool invocation checking. Human evaluation is irrelevant for assessing whether an agent called a specific attack tool."
    104         },
    105         "held_out_test_set": {
    106           "applies": true,
    107           "answer": true,
    108           "justification": "For PoT backdoor attacks, Appendix C.2.2 states testing tasks are 'different from those in the PoT demonstration to ensure the independence and diversity of the experimental results.' No tuning on evaluation data is described for other attacks."
    109         },
    110         "per_category_breakdown": {
    111           "applies": true,
    112           "answer": true,
    113           "justification": "Results are broken down by attack type (DPI, IPI, Memory Poisoning, PoT Backdoor, Mixed), by LLM backbone (Tables 5-8, 14-21), by aggressive vs. non-aggressive tasks (Table 18), and by prompt injection type (Table 17)."
    114         },
    115         "failure_cases_discussed": {
    116           "applies": true,
    117           "answer": true,
    118           "justification": "Appendix D.2 provides detailed analysis of why each defense fails: D.2.1 for DPI/IPI defenses (delimiters, paraphrasing, instructional prevention, sandwich prevention), D.2.2 for PoT defenses, D.2.3 for memory attack defenses."
    119         },
    120         "negative_results_reported": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "The core finding is that current defenses are largely ineffective (Section 5.4, Tables 7-8, Appendix D.2). The paper explicitly reports that defenses reduce ASR only modestly and often cause utility losses."
    124         }
    125       },
    126       "claims_and_evidence": {
    127         "abstract_claims_supported": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Abstract claims (highest average ASR 84.30%, limited defense effectiveness, 10 scenarios, 400+ tools, 27 attack/defense methods, 7 metrics, 13 LLM backbones) are all directly supported by Tables 3 and 5."
    131         },
    132         "causal_claims_justified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Section 5.3 states 'Better agents with stronger backbone LLMs initially exhibit higher ASR due to their superior ability to follow instructions' -- a causal claim based solely on correlational patterns in Figure 2 with no controlled experiments isolating this mechanism."
    136         },
    137         "generalization_bounded": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "The title and abstract present findings about 'Attacks and Defenses in LLM-Based Agents' broadly, but all evaluations use synthetic/simulated task environments with AI-generated tools and tasks. The simulation-only evaluation setting is not acknowledged as a boundary on generalization."
    141         },
    142         "alternative_explanations_discussed": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "No threats-to-validity or limitations section exists. The paper does not consider alternative explanations for results, such as whether the synthetic benchmark tasks and AI-generated tools reflect realistic attack conditions, or whether simulated tool calls bias results."
    146         }
    147       },
    148       "setup_transparency": {
    149         "model_versions_specified": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "All 13 models are identified by marketing name only (e.g., 'Claude-3.5 Sonnet', 'GPT-4o', 'GPT-3.5 Turbo') in Table 13 and throughout the paper. No snapshot dates or API version strings (e.g., gpt-4o-2024-08-06) are provided."
    153         },
    154         "prompts_provided": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "Appendix C.2.4 provides full actual prompt text for: system prompt for LLM-based agents, paraphrasing defense, instructional prevention, sandwich prevention, dynamic prompt rewriting, refusal judgment, and PoT backdoor trigger generation."
    158         },
    159         "hyperparameters_reported": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported anywhere in the paper. These settings significantly affect model output behavior and are necessary for reproducibility."
    163         },
    164         "scaffolding_described": {
    165           "applies": true,
    166           "answer": true,
    167           "justification": "The ReAct agent framework is described in Appendix A.2 with formal notation. The AIOS-based implementation details including LangChain integration, Chroma vector database, and tool simulation are documented in Appendix C.2.2 and C.2.6."
    168         },
    169         "data_preprocessing_documented": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "Appendix B describes in detail how all benchmark data was generated using GPT-4: agent descriptions, user tasks, standard plans, tool definitions (normal and attack). Tool generation process with field descriptions is in B.2.1 and B.2.2. Examples in Tables 9-11."
    173         }
    174       },
    175       "limitations_and_scope": {
    176         "limitations_section_present": {
    177           "applies": true,
    178           "answer": false,
    179           "justification": "There is no dedicated limitations or threats-to-validity section. The paper has an ethics statement (Appendix E) and a reproducibility statement (Appendix F), but no limitations section."
    180         },
    181         "threats_to_validity_specific": {
    182           "applies": true,
    183           "answer": false,
    184           "justification": "No threats-to-validity discussion exists. Specific concerns such as the use of simulated (not real) tool calls, AI-generated tasks, or the validity of ASR as a proxy for real-world harm are not addressed."
    185         },
    186         "scope_boundaries_stated": {
    187           "applies": true,
    188           "answer": false,
    189           "justification": "The paper does not explicitly state what the results do not show. It does not acknowledge that simulated tool calls may not reflect real API behavior, that results on AI-generated tasks may not transfer to real deployments, or that the ReAct framework is only one of many agent architectures."
    190         }
    191       },
    192       "data_integrity": {
    193         "raw_data_available": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Raw data (tool definitions, attack/defense configurations in JSONL format) is available via the GitHub repository (data/all_normal_tools.jsonl and data/all_attack_tools.jsonl), allowing verification of benchmark construction."
    197         },
    198         "data_collection_described": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Appendix B describes in detail how all benchmark data was generated: 10 scenarios x 5 tasks using GPT-4, with normal tools and attack tools generated following specified schema (tool name, description, expected achievement, corresponding agent)."
    202         },
    203         "recruitment_methods_described": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "No human participants were involved. The benchmark data is AI-generated. The ethics statement (Appendix E) explicitly states 'No human subjects were involved in this study.'"
    207         },
    208         "data_pipeline_documented": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "The full pipeline from task generation (GPT-4) through tool simulation (JSONL files loaded as SimulatedTool/AttackerTool objects) to evaluation (tool invocation checking) is documented in Appendices B, C.2.5, and C.2.6."
    212         }
    213       },
    214       "conflicts_of_interest": {
    215         "funding_disclosed": {
    216           "applies": true,
    217           "answer": false,
    218           "justification": "No funding acknowledgment appears anywhere in the paper. There is no acknowledgments section listing grants, institutional support, or corporate sponsors."
    219         },
    220         "affiliations_disclosed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Author affiliations are clearly stated on the title page: four authors from Zhejiang University and four from Rutgers University. No products of theirs are being evaluated."
    224         },
    225         "funder_independent_of_outcome": {
    226           "applies": true,
    227           "answer": false,
    228           "justification": "No funding source is disclosed. The schema says 'NA if unfunded,' but the absence of a funding disclosure does not confirm the work is unfunded -- it may simply be undisclosed. Since we cannot confirm unfunded status, the criterion applies but cannot be satisfied."
    229         },
    230         "financial_interests_declared": {
    231           "applies": true,
    232           "answer": false,
    233           "justification": "No competing interests statement appears in the paper. The absence of such a declaration means financial interests cannot be ruled out."
    234         }
    235       },
    236       "contamination": {
    237         "training_cutoff_stated": {
    238           "applies": true,
    239           "answer": false,
    240           "justification": "Training data cutoffs are not stated for any of the 13 LLM backbones tested. While the benchmark tests agent behavior rather than factual recall, the models' training data may include similar attack patterns from public sources."
    241         },
    242         "train_test_overlap_discussed": {
    243           "applies": true,
    244           "answer": false,
    245           "justification": "No analysis of potential train/test overlap is provided. The benchmark tasks were generated by GPT-4, and some tested models (GPT-4o, GPT-4o-mini) belong to the same model family as the generator, creating potential for systematic bias, but this is not discussed."
    246         },
    247         "benchmark_contamination_addressed": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "The benchmark uses prompt injection patterns from public sources (OWASP 2023, Liu et al. 2024) that models may have been trained on. This potential contamination of attack patterns is not discussed."
    251         }
    252       },
    253       "human_studies": {
    254         "pre_registered": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "No human participants were involved. The ethics statement (Appendix E) explicitly states 'No human subjects were involved in this study.'"
    258         },
    259         "irb_or_ethics_approval": {
    260           "applies": false,
    261           "answer": false,
    262           "justification": "No human participants were involved per the ethics statement (Appendix E)."
    263         },
    264         "demographics_reported": {
    265           "applies": false,
    266           "answer": false,
    267           "justification": "No human participants were involved per the ethics statement (Appendix E)."
    268         },
    269         "inclusion_exclusion_criteria": {
    270           "applies": false,
    271           "answer": false,
    272           "justification": "No human participants were involved per the ethics statement (Appendix E)."
    273         },
    274         "randomization_described": {
    275           "applies": false,
    276           "answer": false,
    277           "justification": "No human participants were involved per the ethics statement (Appendix E)."
    278         },
    279         "blinding_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants were involved per the ethics statement (Appendix E)."
    283         },
    284         "attrition_reported": {
    285           "applies": false,
    286           "answer": false,
    287           "justification": "No human participants were involved per the ethics statement (Appendix E)."
    288         }
    289       },
    290       "cost_and_practicality": {
    291         "inference_cost_reported": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "No inference costs, API costs, or latency figures are reported. Testing 13 LLM backbones across hundreds of attack configurations (including closed-source API models) would involve substantial costs, but these are not quantified."
    295         },
    296         "compute_budget_stated": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "Total computational budget (GPU hours, API costs, total API calls) is not stated anywhere in the paper. Given the scale of experiments (13 LLMs x multiple attack types x hundreds of tasks), this is a notable omission."
    300         }
    301       }
    302     },
    303     "summary": {
    304       "agreement_rate_pct": 99.0,
    305       "total_disagreements": 1,
    306       "by_direction": {
    307         "sonnet_generous": 0,
    308         "opus_generous": 0,
    309         "applies_boundary": 1,
    310         "interpretive": 0
    311       },
    312       "notes": "Extremely high agreement (99.0%). The single disagreement is on funder_independent_of_outcome: Sonnet treats the absence of a funding disclosure as confirming unfunded status (applies=false), while Opus treats it as unknown funding status requiring the criterion to apply (applies=true, answer=false). This is a genuine applies-boundary edge case where the schema says 'NA if unfunded' but does not specify how to handle unknown funding status. Sonnet's scan is otherwise thorough and well-calibrated, with no generosity or strictness bias detected."
    313     }
    314   }
    315 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs