ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

calibration.json (19122B)


      1 {
      2   "paper_slug": "adaptive-attacks-bypass-defenses-2025",
      3   "total_questions": 50,
      4   "agreement_count": 49,
      5   "disagreement_count": 1,
      6   "agreement_rate": 0.98,
      7   "disagreements": [
      8     {
      9       "category": "setup_transparency",
     10       "question": "prompts_provided",
     11       "sonnet_applies": true,
     12       "sonnet_answer": true,
     13       "opus_applies": true,
     14       "opus_answer": false,
     15       "opus_justification": "The paper shows examples of successful attack triggers (e.g., the RL trigger against RPO in Section 5.1, the search trigger against Prompt Sandwiching), but these are outputs of the attack optimization process, not the prompts used to drive the experiments. The actual experimental prompts — the LmMutator system prompt, the critic LLM prompt, the scoring function prompts — are described only in natural language (e.g., 'consists of sections: broad context, the attacker's task, and other miscellaneous information' in Appendix D). Per the schema, 'If prompts are described only in natural language without the actual text, NO.' The LmMutator system prompt is stated to have been 'generated by another LLM and then manually edited' but its actual text is not provided.",
     16       "sonnet_justification": "The paper provides actual prompt examples used in attacks (e.g., the successful RL trigger against RPO in Section 5.1, the search trigger against Prompt Sandwiching, the RL trigger against Data Sentinel in Appendix C.1). These are the actual prompts sent to models, not just templates.",
     17       "direction": "sonnet_generous"
     18     }
     19   ],
     20   "opus_checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No code repository URL is provided anywhere in the paper. The attack methods are described in detail in Appendices A-D but no implementation is released."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper uses publicly available benchmarks: HarmBench, AgentDojo, OpenPromptInject, and the Alpaca/Davinci dataset (with an explicit HuggingFace URL in Section B.1). These are standard public benchmarks that readers can obtain independently."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No requirements file, Dockerfile, or detailed environment specification is provided. The paper mentions using Gemini-2.5 Pro as LmMutator and various LLM APIs but does not provide library versions or reproducible environment details."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The appendices describe attack methods at a conceptual level (PSSU loop, MAP Elites controller, LmMutator design) but there are no step-by-step commands, scripts, or README to allow a researcher to reproduce results without significant guesswork."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "All ASR results in Tables 2-7 and Figure 1 are point estimates. No confidence intervals, error bars, or standard deviations are reported anywhere in the paper."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper compares ASRs between static/weak attacks and adaptive attacks (e.g., 0% vs. 99% for Spotlighting) but no statistical significance tests (p-values, bootstrap tests, etc.) are performed."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The paper reports absolute ASR values for both static attacks and adaptive attacks with clear baselines (e.g., 'near-zero ASR' for static vs. '>90% ASR' for adaptive). Figure 1 and Table 7 show both baseline and attack ASR values, providing sufficient context to assess the magnitude of improvement."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The paper uses 80 samples from AgentDojo (Slack, Travel, Workspace suites) without justifying this sample size. No power analysis is discussed for any of the experiments."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Table 7 reports 'Median Num. Queries' for search attacks but no standard deviation, IQR, or other spread measure. ASR figures are single point estimates with no variance reported across runs."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Each defense is evaluated against both the original paper's static/weak attack (baseline) and the authors' adaptive attacks. Figure 1 shows side-by-side ASR comparison for all 12 defenses."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The 12 defenses evaluated are all from 2024-2025 (e.g., Circuit Breakers 2024, StruQ 2024, MetaSecAlign 2025, Data Sentinel 2025, MELON 2025). The baseline attacks used are those from the original defense papers."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "The paper presents four attack families (gradient, RL, search, human) but does not systematically ablate individual components within each attack. For instance, the search attack's controller, mutator, and scorer components are not individually removed to measure their contribution."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The evaluation uses multiple metrics: attack success rate (ASR), number of queries until first success (Table 7, Figure 6), and utility of the defended system (Table 7 'Utility' column)."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The paper runs a human red-teaming competition with 500+ participants and $20,000 in prizes (Appendix E). Human attacks are directly compared against automated attacks across 29 scenarios."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Results are reported on standard benchmark datasets (HarmBench, AgentDojo, OpenPromptInject) that are separate from the attack design process. The attacks are adaptive to defenses, not tuned to specific test examples."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down by defense type (Prompting, Training, Filtering Model, Secret Knowledge), by individual defense, and by base model within each defense (Table 7)."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Reward hacking is discussed as a failure mode (Appendix C.1, Figure 5) where the RL attacker finds strategies that score well but do not constitute real attacks. PIGuard's lower ASR (71%) relative to other defenses is also acknowledged."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Gradient-based attacks are described as 'generally unreliable' with a recommendation to use text-space attacks instead (Section 4). The search attack reaches only 69% ASR vs. 100% for human red-teamers (Appendix F)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims 12 defenses are bypassed with ASR 'above 90% for most.' This is supported by Figure 1 and Tables 1-7. The qualification 'for most' is accurate since PIGuard reaches 71% and a few other settings fall below 90%."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper's causal claim is that adaptive attacks (vs. static/weak attacks) cause higher ASR against the same defenses. This is demonstrated through controlled comparisons: same defense, same benchmark, different attack method. This is adequate causal identification for this claim type."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Section 5 explicitly states 'the goal of this section is not to provide a full evaluation of defenses across all attacks or to compare the effectiveness of multiple defenses.' Results are scoped to the 12 tested defenses on specific benchmarks."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Appendix F.1 discusses specific challenges in interpreting results: human skill heterogeneity may artificially lower average ASR, differing query budgets complicate human-vs-automated comparison, and collective success may not reflect individual performance. Reward hacking (Appendix C.1) is discussed as an alternative explanation for apparent attack success."
    137       }
    138     },
    139     "setup_transparency": {
    140       "model_versions_specified": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper uses marketing names like 'Gemini-2.5 Pro', 'GPT-5', 'Llama-3.3-70B', and 'Gemini-2.5 Flash' without specific API version dates or snapshot identifiers. Per the schema, marketing names without a snapshot date or API version do not count."
    144       },
    145       "prompts_provided": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "The paper shows examples of successful attack triggers (outputs of the optimization), but the actual experimental setup prompts are not provided. The LmMutator system prompt is described only in natural language in Appendix D ('consists of sections: broad context, the attacker's task, and other miscellaneous information') — the actual text is not given. The critic LLM prompts and scoring function prompts are also described conceptually. Per the schema, natural language descriptions without actual prompt text are NO."
    149       },
    150       "hyperparameters_reported": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "Some parameters are stated (32 independent sessions, 5 rounds for RL; 8 candidates per mutation for search) but key hyperparameters are missing: temperature, learning rate, batch size for RL training, reward weights. The LmMutator uses 'maximum thinking budget' which is not a precise numerical value."
    154       },
    155       "scaffolding_described": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 4 describes the four-step PSSU loop. Appendices A-D provide detailed descriptions of each attack family's scaffolding: the MAP Elites controller, LmMutator design, island-based database, scoring mechanisms, and feedback loops."
    159       },
    160       "data_preprocessing_documented": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section B.1 describes benchmark details: AgentDojo subset of 80 samples from Slack, Travel, and Workspace suites with explanation for why Banking was excluded. OpenPromptInject task pairs and the Alpaca/Davinci dataset source URL are provided."
    164       }
    165     },
    166     "limitations_and_scope": {
    167       "limitations_section_present": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. Section 6 is titled 'Lesson and Discussion' and Appendix F.1 is 'Challenges.' While limitations are discussed throughout, there is no dedicated section for them."
    171       },
    172       "threats_to_validity_specific": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Appendix F.1 identifies specific threats: human skill heterogeneity affecting average ASR, differing costs between human and automated queries complicating comparison, and the caveat that collective human success does not reflect individual performance. These are specific to this study."
    176       },
    177       "scope_boundaries_stated": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 5 states 'the goal of this section is not to provide a full evaluation of defenses across all attacks or to compare the effectiveness of multiple defenses.' The paper explicitly scopes its contribution to demonstrating that adaptive attacks exist and are effective."
    181       }
    182     },
    183     "data_integrity": {
    184       "raw_data_available": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "The raw attack outputs, successful trigger strings, and model response logs are not publicly available. Only selected examples of successful triggers are shown in the paper."
    188       },
    189       "data_collection_described": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The human red-teaming competition setup is described in Appendix E: online platform, 500+ participants, $20,000 prize structure, AgentDojo evaluation environment, scoring mechanism with token penalty, and human judge adjudication for edge cases."
    193       },
    194       "recruitment_methods_described": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "The paper states 'we run a human AI red-teaming competition' with 500+ participants but does not describe how participants were recruited, what channels or communities were used, or whether the participant pool introduces selection bias."
    198       },
    199       "data_pipeline_documented": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "The pipeline from attack submission to ASR is documented: Appendix B describes benchmark setup, Appendix C describes scoring functions for each defense, and Appendix E.2 describes how human submissions are evaluated (automatic + human adjudication for appeals)."
    203       }
    204     },
    205     "conflicts_of_interest": {
    206       "funding_disclosed": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "The paper has no acknowledgments section listing grants, funding agencies, or corporate sponsors. The funding source for the $20,000 competition prize is not disclosed."
    210       },
    211       "affiliations_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Author affiliations are listed on page 1: OpenAI, Anthropic, Google DeepMind, HackAPrompt, Northeastern University, ETH Zurich, AI Security Company, and MATS."
    215       },
    216       "funder_independent_of_outcome": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "Authors from Google DeepMind use Gemini-2.5 Pro as the LmMutator and evaluate defenses on Google models. OpenAI employees participate while GPT-5 is evaluated. Anthropic employees participate. These are non-independent relationships between authors/funders and outcomes."
    220       },
    221       "financial_interests_declared": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No competing interests statement or financial conflict-of-interest declaration is present. Authors at OpenAI, Anthropic, and Google have potential financial interests that are not formally declared."
    225       }
    226     },
    227     "contamination": {
    228       "training_cutoff_stated": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "The paper evaluates defense robustness against adversarial attacks, not pre-trained model knowledge on benchmarks. Contamination of training data is not relevant to measuring whether adaptive attacks can bypass defenses."
    232       },
    233       "train_test_overlap_discussed": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "As with training_cutoff_stated, the paper tests whether attacks can bypass defenses, not whether models have memorized test examples. Train/test overlap is not applicable to this evaluation design."
    237       },
    238       "benchmark_contamination_addressed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "Benchmark contamination is not relevant to adaptive attack evaluations of security defenses. The paper tests defense mechanisms, not model knowledge."
    242       }
    243     },
    244     "human_studies": {
    245       "pre_registered": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "The human red-teaming competition involved 500+ human participants, but no pre-registration link (OSF, AsPredicted, etc.) is provided."
    249       },
    250       "irb_or_ethics_approval": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "The Ethics Statement mentions voluntary participation and anonymized data collection, but no IRB or ethics board approval is mentioned."
    254       },
    255       "demographics_reported": {
    256         "applies": true,
    257         "answer": false,
    258         "justification": "The paper states '500+ participants' but provides no demographic information about them — no experience level, geographic distribution, or professional background."
    259       },
    260       "inclusion_exclusion_criteria": {
    261         "applies": true,
    262         "answer": false,
    263         "justification": "No inclusion or exclusion criteria for competition participants are stated. There is no description of eligibility requirements or screening."
    264       },
    265       "randomization_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "The human red-teaming is a competition where participants self-select which challenges to attempt, not an experimental study with random assignment to conditions. Randomization is not applicable."
    269       },
    270       "blinding_described": {
    271         "applies": true,
    272         "answer": true,
    273         "justification": "Appendix E.3 states 'the name of the model is replaced with a pseudonym, and participants are unaware of defenses potentially being deployed,' constituting partial blinding of participants to both model identity and defense presence."
    274       },
    275       "attrition_reported": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "The paper mentions 500+ total participants and notes 'not every participant attempts every scenario,' but the number of participants per scenario and dropout/completion rates are not systematically reported."
    279       }
    280     },
    281     "cost_and_practicality": {
    282       "inference_cost_reported": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "Despite using Gemini-2.5 Pro with 'maximum thinking budget' as LmMutator, running 32 independent sessions per attack instance, and testing across 12 defenses with multiple benchmarks, no API costs, token counts, or latency figures are reported."
    286       },
    287       "compute_budget_stated": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "No total computational budget, GPU hours, or API spend is stated. The competition prizes ($20,000) are mentioned but the infrastructure and compute costs of running the attacks are not quantified."
    291       }
    292     }
    293   }
    294 }

Impressum · Datenschutz