ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

calibration.json (22811B)


      1 {
      2   "paper_slug": "adaptive-attacks-break-2025",
      3   "total_questions": 50,
      4   "agreement_count": 47,
      5   "disagreement_count": 3,
      6   "agreement_rate": 0.94,
      7   "disagreements": [
      8     {
      9       "category": "contamination",
     10       "question": "training_cutoff_stated",
     11       "sonnet_applies": true,
     12       "sonnet_answer": false,
     13       "opus_applies": false,
     14       "opus_answer": false,
     15       "opus_justification": "The schema states applies=false (NA) when 'the paper does not evaluate a pre-trained model's capability on any benchmark' and gives the example 'studies that test defenses/tools rather than model knowledge.' This paper tests defense robustness against adaptive IPI attacks, not model knowledge or capability. The ASR measures whether the model follows injected instructions under various defense conditions, not whether the model 'knows' benchmark answers. Contamination (memorization of benchmark content) is not the relevant concern here.",
     16       "sonnet_justification": "The paper uses gpt-4-0613 for simulating tool responses but does not state its training data cutoff. For Vicuna-7B and Llama3-8B, no training cutoffs are stated. The InjecAgent benchmark could potentially overlap with training data.",
     17       "direction": "applies_boundary"
     18     },
     19     {
     20       "category": "contamination",
     21       "question": "train_test_overlap_discussed",
     22       "sonnet_applies": true,
     23       "sonnet_answer": false,
     24       "opus_applies": false,
     25       "opus_answer": false,
     26       "opus_justification": "Same reasoning as training_cutoff_stated. The schema explicitly exempts 'studies that test defenses/tools rather than model knowledge' from contamination questions. This paper evaluates defense mechanisms against adaptive attacks, not the model's inherent capability on benchmark tasks. Whether the models saw InjecAgent during training does not invalidate the defense evaluation since the attacks are adaptively optimized per test case regardless.",
     27       "sonnet_justification": "No analysis of whether InjecAgent benchmark examples appeared in the training data of Vicuna-7B or Llama3-8B is provided. The InjecAgent benchmark was published in 2024, and the models may have training data that overlaps with it.",
     28       "direction": "applies_boundary"
     29     },
     30     {
     31       "category": "contamination",
     32       "question": "benchmark_contamination_addressed",
     33       "sonnet_applies": true,
     34       "sonnet_answer": false,
     35       "opus_applies": false,
     36       "opus_answer": false,
     37       "opus_justification": "Same reasoning as the other contamination questions. The schema says NA applies when 'the paper does not evaluate a pre-trained model on any benchmark (same NA rule as training_cutoff_stated).' This paper tests adaptive attacks against defenses, not model knowledge. The relevant evaluation is whether defense mechanisms can withstand adversarial attacks, which is independent of whether the models have seen the specific benchmark examples. Per-example adversarial optimization makes memorization irrelevant to attack effectiveness.",
     38       "sonnet_justification": "InjecAgent was published at ACL 2024 and the Llama-3.1-8B-Instruct model was released in 2024. The potential for the benchmark to appear in the model's training data is not addressed.",
     39       "direction": "applies_boundary"
     40     }
     41   ],
     42   "opus_checklist": {
     43     "artifacts": {
     44       "code_released": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The abstract states 'The code is available at https://github.com/uiuc-kang-lab/AdaptiveAttackAgent', providing a working GitHub repository URL."
     48       },
     49       "data_released": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The paper uses the publicly available InjecAgent benchmark (Zhan et al., 2024, ACL 2024) and AgentDojo benchmark (Debenedetti et al., 2024). No proprietary data was created; all data sources are publicly accessible."
     53       },
     54       "environment_specified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "Appendix A mentions 'a single NVIDIA A100 GPU' and provides LoRA hyperparameters, but no requirements.txt, Dockerfile, conda environment, or explicit library versions are provided. Hardware is specified but software environment dependencies are not."
     58       },
     59       "reproduction_instructions": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "While code is released on GitHub and Appendix A provides hyperparameters (Table 5), the paper does not include step-by-step reproduction instructions. A reader would need to consult the GitHub repository for reproduction guidance."
     63       }
     64     },
     65     "statistical_methodology": {
     66       "confidence_intervals_or_error_bars": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Table 4 reports mean ASRs with standard deviations in +/- notation (e.g., 'Direct Harm Vicuna-7B: 0.90 +/- 0.06'). These standard deviations, computed across the eight defenses, quantify the uncertainty around each mean."
     70       },
     71       "significance_tests": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No statistical significance tests (t-tests, p-values, Mann-Whitney U, etc.) are reported anywhere in the paper. Comparisons between adaptive and non-adaptive attacks, and between defenses, rely solely on descriptive statistics."
     75       },
     76       "effect_sizes_reported": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper reports ASR values with baseline context throughout. For example, original ASR of 12% with adversarial finetuning rising to >50% after adaptive attack (Figure 2). Table 3 compares adaptive vs. non-adaptive strings (24% vs. 76% for perplexity filtering). The magnitude of improvement is consistently contextualized."
     80       },
     81       "sample_size_justified": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "The evaluation uses 100 test cases (50 from each attack type), randomly selected from 1,054 cases in InjecAgent. No power analysis or justification for why 100 cases is sufficient to support the paper's conclusions is provided."
     85       },
     86       "variance_reported": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Table 4 reports standard deviations (+/-) for ASR-defense and ASR-adaptive attack averaged across defenses (e.g., '0.39 +/- 0.19'). Variance across the eight defenses is reported, providing spread information."
     90       }
     91     },
     92     "evaluation_design": {
     93       "baselines_included": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Figure 2 shows original attacks without any defense or adaptive attacks (gray bars) as baseline. Table 3 compares adaptive vs. non-adaptive adversarial strings. Multiple meaningful baselines are included."
     97       },
     98       "baselines_contemporary": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The eight defenses evaluated include recent work from 2023-2024: perplexity filtering (Jain et al., 2023), adversarial finetuning (Piet et al., 2024), LLM-based detection (Armstrong, 2023), fine-tuned DeBERTaV3 detector (ProtectAI, 2024). These represent the current IPI defense literature."
    102       },
    103       "ablation_study": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Table 3 ablates the adaptive component by comparing adaptive vs. non-adaptive adversarial strings for perplexity filtering and paraphrasing defenses. Figure 5 cross-evaluates attack specificity, showing that defense-specific attacks outperform generic attacks. These analyses isolate the contribution of the adaptive training component."
    107       },
    108       "multiple_metrics": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper reports multiple metrics: ASR-defense, ASR-adaptive attack, target rate (Section 5.1, Figure 3), detection rate (Table 2), and valid rate (Figure 4, Section 6.2). This provides a multi-dimensional evaluation."
    112       },
    113       "human_evaluation": {
    114         "applies": false,
    115         "answer": false,
    116         "justification": "This is a technical security evaluation testing automated attack and defense systems. Attack success is measured by whether the agent executes a specific malicious tool, which is a deterministic criterion. Human evaluation is clearly irrelevant to these claims."
    117       },
    118       "held_out_test_set": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The 100 test cases are randomly sampled from InjecAgent for evaluation. For adversarial finetuning, training data consists of unsuccessful attacks (215 Vicuna cases, 816 Llama3 cases) from the full benchmark, which is separate from the 100-case test set. While adversarial string optimization is per-example (standard methodology for adaptive attacks), the defense evaluation uses properly separated training and test data."
    122       },
    123       "per_category_breakdown": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Table 4 provides detailed breakdown by attack type (direct harm, data stealing step 1, data stealing step 2) and agent type (Vicuna-7B vs. Llama3-8B). Figure 2 breaks down results by all eight individual defenses. This is thorough categorical analysis."
    127       },
    128       "failure_cases_discussed": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Section 6.3 discusses why adaptive attacks provide minimal improvement in the second step of data-stealing attacks. Section 4 discusses AutoDAN (genetic algorithm version) struggling in the IPI setting. Appendix B reports reduced effectiveness on AgentDojo. Multiple failure modes are identified and analyzed."
    132       },
    133       "negative_results_reported": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "The paper reports: (1) genetic-algorithm-based AutoDAN 'struggles to adapt to the IPI setting' (Section 4); (2) adaptive attacks are 'not as effective' on AgentDojo vs. InjecAgent (Appendix B); (3) GCG over adversarial finetuning achieves high target rate but relatively low ASR (Section 6.1). Genuine negative findings are reported."
    137       }
    138     },
    139     "claims_and_evidence": {
    140       "abstract_claims_supported": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The abstract claims 'consistently achieving an attack success rate of over 50%' against all eight defenses. Figure 2 confirms all ASR-adaptive attack values exceed 50% for both agents across all defenses. The abstract's claims are well-supported by the results."
    144       },
    145       "causal_claims_justified": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The paper's causal claim is that adaptive attacks cause high ASRs against defenses. This is tested through controlled experiments: each adaptive attack is specifically designed for each defense (the manipulated variable), with cross-evaluation (Figure 5) showing defense-specific attacks outperform generic attacks. Table 3 compares adaptive vs. non-adaptive strings as a controlled single-variable manipulation. The experimental design is adequate for causal inference."
    149       },
    150       "generalization_bounded": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The paper tests only two relatively small models (Vicuna-7B and Llama3-8B), 100 of 1,054 InjecAgent cases, and assumes white-box access. The title 'Adaptive Attacks Break Defenses Against Indirect Prompt Injection Attacks on LLM Agents' and the abstract's claim about 'critical vulnerabilities in current defenses' are broad. While the Limitations section (Section 10) acknowledges white-box access and limited defense coverage, the abstract and conclusion do not adequately bound generalization to the tested setting."
    154       },
    155       "alternative_explanations_discussed": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "The paper does not substantively discuss alternative explanations for why adaptive attacks succeed. Possible alternatives (models are too small/weak, InjecAgent is unrepresentative, white-box access is unrealistic for real attacks) are not explored. The limitations section mentions the white-box assumption and limited defense set but does not consider these as alternative interpretations of the observed results."
    159       }
    160     },
    161     "setup_transparency": {
    162       "model_versions_specified": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The paper specifies Vicuna-7B v1.5 (https://huggingface.co/lmsys/vicuna-7b-v1.5), Llama-3.1-8B-Instruct (https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct), and gpt-4-0613 for simulating tool responses. Specific model versions with HuggingFace URLs are provided."
    166       },
    167       "prompts_provided": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Appendix C provides full text of all prompts: ReAct agent prompt (C.1), LLM-based detector prompt (C.2), instructional prevention (C.3), sandwich prevention (C.4), and paraphrasing prompt (C.5). Template placeholders (e.g., {input}, {tool_response}) are filled from the publicly available InjecAgent benchmark, allowing full reconstruction."
    171       },
    172       "hyperparameters_reported": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Table 5 (Appendix A) provides detailed hyperparameters for all experiments: attack string position, token length, training steps per defense/agent combination. LoRA parameters specified: rank=32, alpha=64, dropout=0.05, learning rate=2.5e-5, 15 epochs. Multi-objective GCG alpha=0.5 with early stopping at 100 steps without loss reduction."
    176       },
    177       "scaffolding_described": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "The paper describes the full ReAct agent framework with complete prompt in Appendix C.1, including the thought/action/observation loop. Tool integration, evaluation procedure (single-turn with up to two tool interactions), and the finetuned agent's tool-calling structure are all described. The agentic scaffolding is comprehensively documented."
    181       },
    182       "data_preprocessing_documented": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section 5.1 documents: random selection of 50 test cases per attack type from 1,054 total (yielding 25 unique direct harm attacks, 27 data stealing attacks, 16 user scenarios). For adversarial finetuning, the full pipeline is described: evaluate all cases, filter successful attacks and invalid outputs, keep unsuccessful attacks (215 for Vicuna, 816 for Llama3)."
    186       }
    187     },
    188     "limitations_and_scope": {
    189       "limitations_section_present": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Section 10 is titled 'Limitations' and contains four detailed bullet points covering first-step-only attacks, white-box assumption, no defense combinations, and non-exhaustive defense coverage."
    193       },
    194       "threats_to_validity_specific": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The four limitations are specific to this study: (1) adversarial strings only control first-step agent action, with data showing limited improvement in step 2; (2) white-box access assumption limits real-world applicability; (3) no testing of combined defenses; (4) does not cover LLM self-evaluation or alternative finetuning methods. Each limitation is concrete and study-specific."
    198       },
    199       "scope_boundaries_stated": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "The limitations explicitly state: 'does not account for the combination of defenses,' does not cover 'LLM self-evaluation,' 'alternative model finetuning methods,' and acknowledges need for 'black-box and grey-box attacks.' Appendix B notes reduced effectiveness in long-context settings, further bounding scope."
    203       }
    204     },
    205     "data_integrity": {
    206       "raw_data_available": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The InjecAgent benchmark is publicly available (published at ACL 2024). The code is released on GitHub (https://github.com/uiuc-kang-lab/AdaptiveAttackAgent). Together these enable independent verification of results. The specific 100-case subset selection is documented (random 50 per attack type)."
    210       },
    211       "data_collection_described": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Section 5.1 describes the dataset: InjecAgent benchmark with 1,054 test cases covering two attack types (direct harm and data stealing). Random sampling of 50 per attack type yields 100 test cases with 25 unique direct harm attacks, 27 data stealing attacks, and 16 user scenarios."
    215       },
    216       "recruitment_methods_described": {
    217         "applies": false,
    218         "answer": false,
    219         "justification": "No human participants in this study. The paper uses automated benchmark test cases from InjecAgent, not human recruits. The data source is a standard public benchmark."
    220       },
    221       "data_pipeline_documented": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "The evaluation pipeline is documented: test case selection from InjecAgent, adversarial string optimization per case (with detailed hyperparameters in Table 5), defense application, attack evaluation with ASR computation. For adversarial finetuning, the pipeline from data creation (filtering unsuccessful attacks from full benchmark) through LoRA finetuning to evaluation is described in Section 3.3 and Appendix A."
    225       }
    226     },
    227     "conflicts_of_interest": {
    228       "funding_disclosed": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "The Acknowledgements section states: 'We would like to acknowledge the Open Philanthropy project for funding this research in part.' Funding source is explicitly disclosed."
    232       },
    233       "affiliations_disclosed": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "Author affiliations are clearly stated in the header: Qiusi Zhan, Richard Fang, and Daniel Kang at University of Illinois Urbana-Champaign; Henil Shalin Panchal at Nirma University. The paper evaluates open-source models and public defenses with no commercial product affiliation."
    237       },
    238       "funder_independent_of_outcome": {
    239         "applies": true,
    240         "answer": true,
    241         "justification": "Open Philanthropy is a philanthropic organization focused on effective giving. It has no commercial interest in whether IPI defenses are found to be robust or vulnerable. The funder is independent of the outcome."
    242       },
    243       "financial_interests_declared": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "No competing interests or financial interests statement is present in the paper. Per the schema: 'If there is no competing interests statement at all, NO -- absence of disclosure is not the same as absence of conflict.'"
    247       }
    248     },
    249     "contamination": {
    250       "training_cutoff_stated": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "The schema specifies applies=false (NA) when 'the paper does not evaluate a pre-trained model's capability on any benchmark (e.g., mining studies, interview studies, surveys, or studies that test defenses/tools rather than model knowledge).' This paper evaluates defense robustness against adaptive IPI attacks. The ASR measures whether defenses can prevent model manipulation, not model knowledge or capability. Contamination (memorized benchmark answers) is not the relevant concern for defense evaluation."
    254       },
    255       "train_test_overlap_discussed": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "Same NA rule applies. This paper tests defense mechanisms against adaptive attacks, not pre-trained model capability on benchmark tasks. Whether the models have seen InjecAgent during training does not invalidate the defense evaluation, as the attacks are adaptively optimized per test case with white-box access regardless."
    259       },
    260       "benchmark_contamination_addressed": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "Same NA rule applies. The paper evaluates adaptive attacks against defenses, not model knowledge. Per-example adversarial string optimization with white-box access makes benchmark memorization largely irrelevant to the attack methodology and defense evaluation."
    264       }
    265     },
    266     "human_studies": {
    267       "pre_registered": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants. This is a purely automated technical evaluation of attack and defense systems using benchmark test cases."
    271       },
    272       "irb_or_ethics_approval": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants. Section 9 discusses ethical considerations regarding dual-use research but no IRB approval is relevant or required."
    276       },
    277       "demographics_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "inclusion_exclusion_criteria": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       },
    287       "randomization_described": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "No human participants in this study."
    291       },
    292       "blinding_described": {
    293         "applies": false,
    294         "answer": false,
    295         "justification": "No human participants in this study."
    296       },
    297       "attrition_reported": {
    298         "applies": false,
    299         "answer": false,
    300         "justification": "No human participants in this study."
    301       }
    302     },
    303     "cost_and_practicality": {
    304       "inference_cost_reported": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "Appendix A states that each adversarial string is trained on 'a single NVIDIA A100 GPU for approximately 30 minutes,' but this is training/optimization cost, not inference cost. API costs for gpt-4-0613 calls in data stealing evaluation, total inference cost per example, and wall-clock time for the full evaluation pipeline are not reported."
    308       },
    309       "compute_budget_stated": {
    310         "applies": true,
    311         "answer": true,
    312         "justification": "Appendix A states: 'We train each adversarial string on a single NVIDIA A100 GPU for approximately 30 minutes.' Table 5 provides training steps per defense/agent combination. LoRA finetuning details (15 epochs) are also provided. This gives a reasonable picture of the total computational budget."
    313       }
    314     }
    315   }
    316 }

Impressum · Datenschutz