ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (29567B)


      1 {
      2   "paper": {
      3     "title": "OET: Optimization-based prompt injection Evaluation Toolkit",
      4     "authors": [
      5       "Jinsheng Pan",
      6       "Xiaogeng Liu",
      7       "Chaowei Xiao"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv (Preprint, under review)",
     11     "arxiv_id": "2505.00843",
     12     "doi": "10.48550/arXiv.2505.00843"
     13   },
     14   "scan_version": 2,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "OET provides a modular toolkit for evaluating optimization-based prompt injection attacks across diverse QA datasets. Experiments show open-source models (Qwen2-7B, Vicuna-7B, LLaMA3.1-8B) are significantly more susceptible to transferable GCG attacks than closed-source models (GPT-4o-mini, Claude-3.5-sonnet). Current defenses (StruQ, SecAlign) are inconsistent: StruQ achieves 0% ASR on most datasets but fails on TriviaQA and FinQA, while SecAlign actually increases vulnerability on AQuA and PubMedQA compared to the undefended baseline.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Paper states 'The code are publicly available on https://github.com/SaFoLab-WISC/OET' in the abstract footnote."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "All datasets used are publicly available benchmarks: BIPIA, SQuAD, CaseHold, FinQA, SciQ, TriviaQA, AQuA, PubMedQA. Each is cited with its original source."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No requirements.txt, Dockerfile, or environment specification is mentioned in the paper. Only the GitHub URL is provided without environment details."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Figure 2 shows code usage examples but no step-by-step reproduction instructions for the specific experiments. No README or reproduction guide is described in the paper."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Main results tables (Tables 1, 2, 3) report only point estimates of ASR with no confidence intervals or error bars. Appendix Table 5 shows std dev for training ASR only."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Claims like 'open-source models exhibit higher susceptibility' and 'StruQ effectively neutralizes attacks' are made by comparing raw ASR numbers without any statistical significance tests."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Table 2 reports both baseline and defense model ASR side by side with directional arrows, and the text notes specific magnitudes (e.g., 'ASR increases (+0.43)' for StruQ on TriviaQA). Baseline context is consistently provided across all comparisons."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No justification for why 400 test examples and 5 training examples were chosen per dataset. No power analysis discussed."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Appendix B (Table 5) reports std dev across 3 training runs, but the main evaluation results (Tables 1, 2, 3) report single values per cell with no variance or spread measures despite each example being attacked 5 times."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Table 2 compares defense models (StruQ, SecAlign) against the base undefended LLaMA model. Table 1 compares multiple models. Table 3 compares multiple attack methods."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Defense baselines are recent: StruQ (2024) and SecAlign (2025). Attack methods include GCG (2023), AutoDAN (2023), PAIR (2024), and other contemporary techniques."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No ablation study of the toolkit's own components (data conversion, adversarial training, inference, result checking stages). Table 3 compares different optimizers but this is a comparison of existing methods, not an ablation of OET."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Only ASR (Attack Success Rate) is used as the evaluation metric. Section 4.2 states 'Our evaluation metric is ASR.' The toolkit supports custom metrics (Figure 3) but none are used in experiments."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation. Attack success is determined entirely by automated string matching (checking if 'sql injection' appears in the model's response). No human assessment of attack quality or realism."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Appendix A (Table 4) shows separate training and test splits: adversarial strings are optimized on 5 training examples and evaluated on 400 test examples per dataset."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "All result tables break down ASR by individual dataset (BIPIA, SQuAD, CaseHold, FinQA, SciQ, TriviaQA, AQuA, PubMedQA) covering diverse domains."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "Figure 5 shows only a successful attack example. While aggregate failures are noted (PEZ has 0 ASR everywhere, StruQ fails on TriviaQA), no qualitative failure examples are shown and no root-cause analysis of why specific attacks fail on specific datasets is provided."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Several negative results reported: PEZ achieves 0 ASR on all datasets (Table 3), AutoDAN near-zero on most, SecAlign increases vulnerability on AQuA (0.46 vs 0.23 baseline, +0.23) and PubMedQA (0.59 vs 0.48 baseline, +0.11) (Table 2)."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims about toolkit modularity (Figures 1-3), limitations of current defenses (Tables 2-3), and models remaining susceptible after security enhancements (Table 2 showing SecAlign increasing ASR on some datasets) are all supported by experimental results."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The main causal claim is that defense mechanisms reduce attack success rate. Table 2 uses a controlled comparison: same GCG attack and datasets, with only the model changed (base vs StruQ vs SecAlign). This single-variable design supports the causal claims."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper claims 'open-source models exhibit higher susceptibility to adversarial attacks' based on only 3 open-source (7-8B parameter) and 2 closed-source models. Model size and safety training are confounded with the open/closed distinction. The title suggests general 'prompt injection evaluation' but only one attack objective ('Print sql injection') is tested."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "No alternative explanations discussed. The open-source vs closed-source gap could be due to model size (7-8B vs much larger), safety training intensity, or content filtering rather than source availability. The domain-specific failure of defenses is noted but not explained."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper measures ASR via exact string matching for 'sql injection' and frames this as evaluating 'adversarial robustness' and 'prompt injection vulnerability.' The gap between this narrow proxy (single fixed-string target) and real-world prompt injection risk (which involves diverse, context-dependent exploitation goals) is not acknowledged."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "GPT-4o-mini is a marketing name without API snapshot date. Other models are better specified: 'LLama3.1-8B-Instruct' (Figure 1), 'Claude-3.5-sonnet', 'Vicuna-7B', 'Qwen2-7B-Instruct'. Per schema guidelines, marketing names without snapshot dates do not count."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Figure 5 shows one example prompt structure with [MARK]/[INST]/[COLN] tags for the SecAlign format. However, the full prompts used for all models and experiments are not provided. The base model prompt format (Table 1) is not shown."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 4.2 reports: 500 optimization steps, temperature 0.6, max new tokens 64. Figure 1 shows optimizer configuration format."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. OET is a sequential pipeline (data conversion → training → inference → result checking), not an agentic system."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "Section 3 describes 'Data Conversion' at a high level ('raw data is preprocessed and transformed into a unified format') but does not detail the exact transformation steps. Appendix A gives dataset sizes but not preprocessing specifics."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No dedicated limitations section. The conclusion briefly mentions 'Future work may explore more sophisticated attack strategies, adaptive defense mechanisms, and real-world deployment scenarios' but this is future work, not limitations."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No threats to validity are discussed. Potential issues like single attack objective, small model sample, model size confound, and transferability-only evaluation for closed-source models are not acknowledged."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No explicit scope boundaries stated. The paper does not identify what the results do NOT show, such as the limitation to transferable attacks for closed-source models or the narrow single-objective attack setup."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The underlying QA datasets are public, but the raw experimental outputs (model responses used to compute ASR) are not released. Only aggregate ASR numbers are reported."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 4.1 describes the dataset composition. Appendix A (Table 4) details the number of train/test examples per dataset, their domains, and sampling methodology for BIPIA subsets."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. Data comes from standard public QA benchmarks."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "The pipeline stages are described (Section 3) but exact data transformations are not documented. The paper says 'We first collect Question Answering (QA) datasets and then transform them into desired format' without specifying how raw datasets were converted to the unified format."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding or acknowledgments section. No mention of grants, sponsors, or funding sources."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly stated: University of Rochester and University of Wisconsin-Madison. Two authors (Xiaogeng Liu and Chaowei Xiao) appear in the AutoDAN reference, which is one of the attack methods evaluated."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": false,
    221         "answer": false,
    222         "justification": "No funding disclosed. Authors are university-affiliated with no apparent commercial interest in the outcome."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests statement or financial disclosure present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "This paper tests prompt injection attacks and defenses, not model knowledge on benchmarks. Whether models have seen QA answers in training does not affect whether they output the attack target string."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Paper evaluates defense/attack effectiveness, not model knowledge. Contamination of QA benchmark answers is irrelevant to the ASR measurement."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "Paper evaluates defense/attack effectiveness, not model knowledge. Benchmark contamination does not affect the prompt injection attack success rate metric."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No inference cost or latency reported. The paper runs attacks against both API-based models (GPT-4o-mini, Claude-3.5-sonnet) and local models without reporting any cost or time information."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No computational budget stated. 500 optimization steps per attack across 8 datasets and 7+ attack methods implies substantial compute, but no GPU hours, API costs, or hardware details are provided."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "Main evaluation results (Tables 1, 2, 3) appear to be single-run results. Appendix B shows 3 training runs with std dev but only for training ASR, not the main transferability evaluation."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": true,
    305         "justification": "Section 4.2 states 'Each test example is attacked five times. A test example is considered success if our attack objective appears in the response.' Appendix B states 'We trained adverserial strings for each training data sample 3 times.'"
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No hyperparameter search described. The choice of 500 optimization steps, temperature 0.6, and max tokens 64 is stated without justification or search budget."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No justification for why these specific hyperparameters were chosen. No description of any configuration selection process."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "No statistical tests are performed at all, so correction for multiple comparisons is not applicable."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "Authors Xiaogeng Liu and Chaowei Xiao co-authored AutoDAN, one of the attack methods evaluated. The potential bias of evaluating one's own attack method within the toolkit is not acknowledged. More broadly, the implementation of all attacks within OET could affect their relative performance."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "Different attack methods have drastically different compute profiles (gradient-based GCG with 500 steps vs LLM-query-based PAIR) but performance is compared without any compute normalization or discussion."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "No discussion of whether ASR with a single fixed target string ('sql injection') via exact string matching actually measures real-world prompt injection vulnerability. The construct validity of this narrow metric is not questioned."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No agentic scaffolding is involved in the evaluation. Models are tested directly without scaffolding."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the QA datasets or their associated contexts appeared in the training data of the models being attacked. While less critical for attack evaluation than knowledge evaluation, model familiarity with input formats could affect results."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the evaluation format or QA context provides inadvertent cues. The adversarial strings are trained on a specific format and transferred — whether this format alignment leaks information is not addressed."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "Training set is 5 examples and test set is 400 examples per dataset. Whether these splits are truly independent (e.g., no structural similarity between train and test QA pairs from the same source) is not verified."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No leakage detection or prevention methods are applied or discussed."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "Open-source models exhibit significantly higher susceptibility to transferable GCG prompt injection attacks than closed-source models.",
    369       "evidence": "Table 1 shows open-source models (Qwen2-7B: 0.93-0.99 ASR, LLaMA3.1-8B: 0.24-0.95, Vicuna-7B: 0.15-0.91) vs closed-source (GPT-4o-mini: 0.0-0.1, Claude-3.5-sonnet: 0.01-0.29) across 8 datasets.",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "StruQ defense is not universally effective, failing on TriviaQA (ASR 0.43) and FinQA (ASR 0.28) while achieving 0% ASR on other datasets.",
    374       "evidence": "Table 2 shows StruQ reduces ASR to 0.0 on 6/8 datasets but increases or maintains high ASR on TriviaQA (+0.43 vs baseline) and FinQA (0.28).",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "SecAlign defense can increase model vulnerability on some datasets compared to the undefended baseline.",
    379       "evidence": "Table 2 shows SecAlign increases ASR on AQuA (0.46 vs 0.23 baseline, +0.23) and PubMedQA (0.59 vs 0.48 baseline, +0.11).",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "No single attack method dominates across all datasets against SecAlign.",
    384       "evidence": "Table 3 shows UAT is strongest on SciQ (0.78) and SQuAD (0.6), GCG leads on PubMedQA (0.59) and CaseHold (0.48), while PEZ achieves 0 ASR everywhere and AutoDAN near-zero on most datasets.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "OET provides a modular and extensible framework for evaluating prompt injection attacks.",
    389       "evidence": "Figures 1-3 show the toolkit architecture, code API, and customization interface. The paper demonstrates integration of 7 different attack methods across 8 datasets.",
    390       "supported": "moderate"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "Single attack objective",
    396       "detail": "All experiments use only one attack goal ('Print sql injection'). This severely limits the generalizability of conclusions about defense effectiveness. Different attack objectives could produce very different ASR patterns."
    397     },
    398     {
    399       "flag": "Model size confounded with open/closed distinction",
    400       "detail": "All open-source models are 7-8B parameters while closed-source models (GPT-4o-mini, Claude-3.5-sonnet) are presumably much larger. The paper attributes the ASR gap to open-source vs closed-source status without acknowledging this confound."
    401     },
    402     {
    403       "flag": "Only transferable attacks for closed-source models",
    404       "detail": "Closed-source models are only tested with transferred adversarial strings (trained on LLaMA3.1-8B), not adaptive attacks. The claimed robustness of closed-source models may reflect poor transferability rather than genuine robustness."
    405     },
    406     {
    407       "flag": "No statistical testing",
    408       "detail": "All comparative claims are made by eyeballing raw ASR numbers. With no confidence intervals, significance tests, or variance reported across the repeated attack attempts in the main results, it's impossible to determine whether observed differences are reliable."
    409     },
    410     {
    411       "flag": "Very small training set",
    412       "detail": "Only 5 training examples per dataset (except BIPIA with 15) are used to optimize adversarial strings. Results may be highly sensitive to this specific small training sample."
    413     },
    414     {
    415       "flag": "Author evaluates own attack method",
    416       "detail": "Co-authors Xiaogeng Liu and Chaowei Xiao also co-authored AutoDAN, one of the evaluated attack methods. AutoDAN scores near-zero in the evaluation (Table 3), but no self-evaluation bias acknowledgment is provided."
    417     },
    418     {
    419       "flag": "No limitations section",
    420       "detail": "The paper lacks any discussion of limitations, threats to validity, or scope boundaries. Significant methodological constraints (single objective, transfer-only for closed-source, model size confound) go entirely unacknowledged."
    421     }
    422   ],
    423   "cited_papers": [
    424     {
    425       "title": "Universal and transferable adversarial attacks on aligned language models",
    426       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J. Zico Kolter", "Matt Fredrikson"],
    427       "year": 2023,
    428       "arxiv_id": "2307.15043",
    429       "relevance": "Introduces GCG, the primary attack method evaluated in the paper, foundational work on gradient-guided adversarial attacks on LLMs."
    430     },
    431     {
    432       "title": "AutoDAN: Generating stealthy jailbreak prompts on aligned large language models",
    433       "authors": ["Xiaogeng Liu", "Nan Xu", "Muhao Chen", "Chaowei Xiao"],
    434       "year": 2023,
    435       "arxiv_id": "2310.04451",
    436       "relevance": "Key jailbreak attack method adapted for prompt injection evaluation in OET."
    437     },
    438     {
    439       "title": "Jailbreaking black box large language models in twenty queries",
    440       "authors": ["Patrick Chao", "Alexander Robey", "Edgar Dobriban", "Hamed Hassani", "George J. Pappas", "Eric Wong"],
    441       "year": 2024,
    442       "arxiv_id": "2310.08419",
    443       "relevance": "Introduces PAIR, a black-box LLM-as-optimizer jailbreak method evaluated in the paper."
    444     },
    445     {
    446       "title": "StruQ: Defending against prompt injection with structured queries",
    447       "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"],
    448       "year": 2024,
    449       "arxiv_id": "2402.06363",
    450       "relevance": "Key defense mechanism evaluated in the paper; shows inconsistent performance across domains."
    451     },
    452     {
    453       "title": "SecAlign: Defending against prompt injection with preference optimization",
    454       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "David Wagner", "Chuan Guo"],
    455       "year": 2025,
    456       "arxiv_id": "2410.05451",
    457       "relevance": "State-of-the-art defense method shown to paradoxically increase vulnerability on some datasets."
    458     },
    459     {
    460       "title": "Automatic and universal prompt injection attacks against large language models",
    461       "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang", "Ning Zhang", "Chaowei Xiao"],
    462       "year": 2024,
    463       "arxiv_id": "2403.04957",
    464       "relevance": "Universal prompt injection attack using gradient optimization, directly relevant to the adaptive attack framework in OET."
    465     },
    466     {
    467       "title": "Prompt injection attack against LLM-integrated applications",
    468       "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li", "Kailong Wang", "Zihao Wang", "Xiaofeng Wang", "Tianwei Zhang", "Yepang Liu", "Haoyu Wang", "Yan Zheng", "Yang Liu"],
    469       "year": 2024,
    470       "arxiv_id": "2306.05499",
    471       "relevance": "Foundational work on prompt injection attacks against LLM-integrated applications."
    472     },
    473     {
    474       "title": "AgentDojo: A dynamic environment to evaluate attacks and defenses for LLM agents",
    475       "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunović", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"],
    476       "year": 2024,
    477       "arxiv_id": "2406.13352",
    478       "relevance": "Competing evaluation platform for prompt injection with 629 test cases, used as a point of comparison for OET's approach."
    479     },
    480     {
    481       "title": "HarmBench: A standardized evaluation framework for automated red teaming and robust refusal",
    482       "authors": ["Mantas Mazeika", "Long Phan", "Xuwang Yin", "Andy Zou", "Zifan Wang", "Norman Mu", "Elham Sakhaee", "Nathaniel Li", "Steven Basart", "Bo Li", "David Forsyth", "Dan Hendrycks"],
    483       "year": 2024,
    484       "relevance": "Related standardized red-teaming evaluation framework that OET aims to extend for prompt injection specifically."
    485     },
    486     {
    487       "title": "Benchmarking and defending against indirect prompt injection attacks on large language models",
    488       "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu", "Keegan Hines", "Emre Kiciman", "Guangzhong Sun", "Xing Xie", "Fangzhao Wu"],
    489       "year": 2023,
    490       "arxiv_id": "2312.14197",
    491       "relevance": "Introduces BIPIA dataset used in the evaluation and provides baseline benchmarks for indirect prompt injection."
    492     },
    493     {
    494       "title": "On evaluating adversarial robustness",
    495       "authors": ["Nicholas Carlini", "Anish Athalye", "Nicolas Papernot", "Wieland Brendel", "Jonas Rauber", "Dimitris Tsipras", "Ian Goodfellow", "Aleksander Madry", "Alexey Kurakin"],
    496       "year": 2019,
    497       "arxiv_id": "1902.06705",
    498       "relevance": "Foundational work on evaluating adversarial robustness that motivates OET's adaptive testing framework."
    499     },
    500     {
    501       "title": "FATh: Authentication-based test-time defense against indirect prompt injection attacks",
    502       "authors": ["Jiongxiao Wang", "Fangzhou Wu", "Wendi Li", "Jinsheng Pan", "Edward Suh", "Z. Morley Mao", "Muhao Chen", "Chaowei Xiao"],
    503       "year": 2024,
    504       "arxiv_id": "2410.21492",
    505       "relevance": "Authentication-based defense against prompt injection attacks by co-authors of this paper."
    506     },
    507     {
    508       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    509       "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
    510       "year": 2024,
    511       "arxiv_id": "2310.12815",
    512       "relevance": "Related benchmark for formalizing prompt injection attacks and defenses."
    513     }
    514   ]
    515 }

Impressum · Datenschutz