ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (31699B)


      1 {
      2   "paper": {
      3     "title": "Goal-Guided Generative Prompt Injection Attack on Large Language Models",
      4     "authors": [
      5       "Chong Zhang",
      6       "Mingyu Jin",
      7       "Qinkai Yu",
      8       "Chengzhi Liu",
      9       "Haochen Xue",
     10       "Xiaobo Jin"
     11     ],
     12     "year": 2024,
     13     "venue": "IEEE International Conference on Data Mining (ICDM)",
     14     "arxiv_id": "2404.07234",
     15     "doi": "10.1109/ICDM59182.2024.00119"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval", "theoretical"],
     20   "key_findings": "The paper proposes G2PIA, a query-free black-box prompt injection attack that maximizes KL-divergence between clean and adversarial text posterior distributions, proving this is equivalent to maximizing Mahalanobis distance under Gaussian assumptions. Experiments on 7 LLMs (GPT-3.5/4, Llama-2 variants) and 4 QA datasets show ASR up to 81.46% (SQuAD2.0 with GPT-3.5-Turbo), outperforming BertAttack, TextFooler, and other baselines. Parameter sensitivity analysis finds optimal attack at ε=0.2, γ=0.5, and transferability analysis shows GPT-4-Turbo-generated attacks transfer most effectively.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. The paper describes the method algorithmically but does not release an implementation."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The paper uses publicly available datasets: GSM8K, SQuAD2.0, MATH, and Web-based QA, all of which are standard public benchmarks. However, the generated adversarial examples are not released."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No requirements.txt, Dockerfile, or environment specification is provided. The paper mentions using BERT for embeddings and word2vec for semantic distance but does not specify library versions or environment details."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions are provided. The method is described at an algorithmic level (Sections III-D through III-F) but there are no scripts, commands, or reproducibility guide."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Tables I, II, and III report only point estimates for Aclean, Aattack, and ASR. No confidence intervals, error bars, or ± notation appears anywhere in the results."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper claims their method 'achieves the best results on both data sets' (Section IV-C) and outperforms baselines, but provides no statistical significance tests (no p-values, t-tests, or similar). Comparisons are based solely on raw ASR numbers."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The paper reports Clean Accuracy, Attack Accuracy, and ASR in context for each method, allowing direct comparison. For example, 'our algorithm with ASR 44.87% compared to BertAttack's 33.46%' (Section IV-C), providing baseline context for the magnitude of improvement."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "Section IV-A states 'We randomly selected 300 examples from the following dataset' but provides no justification for why 300 was chosen. No power analysis or discussion of whether 300 is sufficient."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No standard deviations, variance, or spread measures are reported. It is unclear whether experiments were run multiple times or only once. All results are single point estimates."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Table II compares G2PIA against 6 baseline methods: BertAttack, DeepWordBug, TextFooler, TextBugger, Stress Test, and CheckList on two datasets."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Baselines include PromptBench (2023), BertAttack (2020), CheckList (2020), and TextFooler (2020). PromptBench is contemporary, and the others represent the established state of the art for black-box text attacks."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Table III presents ablation studies on GSM8K and Web-based QA with GPT-3.5-Turbo, comparing random position prompt injection (ASR 29.18%), random component replacement (ASR 18.33%), and the full method (ASR 47.60%)."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Three evaluation metrics are used: Clean Accuracy, Attack Accuracy, and Attack Success Rate (ASR). The relationship between them is formally defined in Section IV-A3."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No human evaluation is conducted. The paper claims attacks are 'imperceptible' (Section III-A) but this claim is never validated by human judges assessing whether the injected text is detectable."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "The paper randomly selects 300 examples per dataset for testing. Parameters ε=0.2, γ=0.5 are selected based on Table V using GSM8K with GPT-3.5, then applied to all datasets/models. There is no explicit separation of a validation set for parameter tuning versus a held-out test set for final reporting."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Table I provides per-dataset breakdowns (GSM8K, Web-based QA, SQuAD2.0, Math) and per-model breakdowns (7 LLMs). Table V provides per-parameter breakdowns."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "The paper notes that 'mathematical problems are the most difficult to attack' (Section IV-B) but does not analyze specific failure examples or discuss why particular attacks fail. No error analysis or qualitative failure examples beyond this observation."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The ablation study (Table III) shows that random position injection and random component replacement perform substantially worse than the full method. Parameter sensitivity analysis (Figs. 5-6, Table V) shows configurations with poor performance."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims: (1) theoretical equivalence of KL-divergence and Mahalanobis distance (proved in Theorem 1/Appendix B), (2) effectiveness on seven LLM models and four datasets (Table I), (3) query-free black-box attack (the method design confirms this). All claims are supported by results."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The paper makes causal claims through ablation studies (Table III): removing the guided component selection and replacing with random selection degrades ASR. This controlled manipulation adequately supports the causal claim that the guided approach improves attack success."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title claims attacks 'on Large Language Models' generally, but experiments cover only 7 specific models from two families (GPT and Llama-2) on Q&A tasks only. No discussion of whether results generalize to other model families (Claude, Gemini, Mistral), other task types (summarization, translation, coding), or non-English languages."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "No alternative explanations are discussed for the attack's effectiveness. For instance, the paper doesn't consider whether the success is due to the Gaussian assumption being approximately correct, or whether simpler mechanisms (e.g., distraction from irrelevant text) explain the same results."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper measures ASR (whether the model gives incorrect answers after injection) and claims attack effectiveness. The measurement directly matches the claim — no proxy gap exists. The formal definition of ASR in Section IV-A3 is precise."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Specific model versions are provided: 'text-davinci-003', 'gpt-3.5-turbo-0125', 'gpt-4-0613', 'gpt-4-0125-preview', 'llama-2-7b-chat', 'llama-2-13b-chat', 'llama-2-70b-chat' (Table I, Section IV-A)."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "The paper describes the attack pipeline conceptually (Fig. 3) and shows examples of injected sentences (Table IV), but does not provide the actual prompt template sent to GPT-4-Turbo for generating adversarial sentences. The reader cannot reconstruct the exact prompts used."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "The method hyperparameters are stated (ε=0.2, δ=0.05, γ=0.5, random number range 10-100), but LLM API settings (temperature, top-p, max tokens) for the auxiliary GPT-4-Turbo model and for the victim models are not reported. These settings significantly affect output."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding is used. The attack pipeline is a fixed sequence of POS extraction, synonym lookup, sentence generation, and constraint checking — not an agentic workflow."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper states '300 examples were randomly selected' per dataset but does not describe how random selection was performed, whether any filtering was applied, or how questions were formatted for input to the models."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "There is no dedicated limitations, threats-to-validity, or discussion section. The paper goes directly from ablation studies (Section V) to conclusion (Section VI) without discussing limitations."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No threats to validity are discussed anywhere in the paper. The Gaussian distribution assumption underlying the entire theoretical framework is acknowledged as unverifiable ('We cannot confirm whether these assumptions hold', Appendix A) but no specific threats to the experimental validity are identified."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No explicit scope boundaries are stated. The paper does not identify what the results do NOT show — e.g., that results are limited to Q&A tasks, English text, specific model families, or the particular injection strategy tested."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "The generated adversarial examples, model outputs, and raw experimental data are not made available. Only aggregated metrics (Aclean, Aattack, ASR) are reported in tables."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section IV-A2 describes the four datasets used (GSM8K, Web-based QA, MATH, SQuAD2.0) with citations and brief descriptions. Section IV-A states 300 examples were randomly selected from each."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. Data sources are standard public benchmarks (GSM8K, SQuAD2.0, MATH, Web-based QA)."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "The attack generation pipeline is described (POS extraction → synonym selection → sentence generation → constraint checking), but the end-to-end data pipeline from raw dataset selection through to metric computation is not documented with filtering counts or intermediate step details."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Funding is disclosed in the footnote: 'This work was partially supported by Research Development Fund with No. RDF-22-01-020, the Qing Lan Project in Jiangsu universities and National Natural Science Foundation of China under Grant U1804159.'"
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: Xi'an Jiaotong-Liverpool University and University of Liverpool. These are academic institutions not affiliated with any evaluated product."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Funding comes from academic sources (university research fund, Qing Lan Project, NSFC). None of the funders have a financial interest in the outcome of LLM attack research."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement is included in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No training data cutoff dates are stated for any of the 7 models tested. The paper uses GSM8K, SQuAD2.0, MATH, and Web-based QA — all public benchmarks that could be in the training data — without stating when model training data was collected."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No discussion of whether the benchmark examples appeared in model training data. GSM8K, SQuAD2.0, and MATH are widely known benchmarks that GPT-4 and Llama-2 may have been trained on."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "All four benchmarks (GSM8K 2021, SQuAD2.0 2018, MATH 2021, Web-based QA 2022) were published before the models' training periods. No contamination analysis or decontamination is mentioned."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "The abstract claims 'low computational cost' and the method is described as 'query-free', but no actual cost figures are provided — no API costs, tokens consumed, wall-clock time, or cost per attack example."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No computational budget is stated. The experiments involve calling GPT-4-Turbo to generate adversarial text and testing 7 models × 4 datasets × 300 examples, but total API spend, GPU hours, or hardware are not reported."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No mention of random seeds or sensitivity analysis across seeds. The method involves random synonym selection and random number generation (Nt' between 10-100) but results for different seeds are not reported."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The number of experimental runs is not stated. It is unclear whether each configuration was tested once or multiple times. Section III-F mentions iterating N times to generate adversarial text but N is not specified."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "Table V shows a parameter grid for (ε, γ) but does not report the total compute budget for this search. Nine values are tested for each parameter but total search cost is unstated."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": true,
    318         "justification": "Table V shows ASR for all (ε, γ) combinations tested on GSM8K with GPT-3.5, and the best configuration (ε=0.2, γ=0.5) is selected based on highest ASR. The full grid is transparent."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "No statistical tests are performed at all, let alone corrections for multiple comparisons. The paper compares 7 methods on 2 datasets (Table II) and evaluates 7 models on 4 datasets (Table I) without any statistical testing."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors compare their system against baselines reported from PromptBench (Table II) and their own ablations, but do not acknowledge self-comparison bias or discuss whether their implementation of baselines might differ from original results."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "No performance-vs-compute analysis is provided. The paper claims to be 'query-free' with 'low computational cost' but does not compare compute requirements against query-dependent baselines like BertAttack or TextFooler."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "No discussion of whether ASR on Q&A benchmarks actually measures what is claimed about prompt injection attack effectiveness in real-world settings. The paper does not question whether benchmark performance translates to practical attack capability."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The attack uses GPT-4-Turbo as an auxiliary model for generating adversarial text. The paper does not discuss how the choice of auxiliary model affects results or whether using a different auxiliary model would change attack effectiveness."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "Not discussed. All benchmarks (SQuAD2.0 from 2018, MATH from 2021, GSM8K from 2021, Web-based QA from 2022) predate the models tested, meaning clean accuracy baselines could be inflated by training contamination."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "Not discussed. The evaluation setup where the full question is provided to the model is standard, but no analysis of whether model familiarity with benchmark format constitutes feature leakage."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "Not discussed. No analysis of whether the 300 randomly selected examples per dataset are independent or share structural similarities that could bias results."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No leakage detection or prevention methods are used. No canary strings, membership inference, or decontamination pipelines."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "G2PIA achieves higher attack success rates than existing black-box attack methods on both SQuAD2.0 and Math datasets",
    372       "evidence": "Table II shows G2PIA achieves 79.50% ASR on SQuAD2.0 (vs BertAttack 65.33%, TextFooler 78.59%) and 44.87% ASR on Math (vs BertAttack 33.46%). Tested against 6 baseline methods on ChatGPT-3.5.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "Maximizing KL-divergence between conditional distributions of clean and adversarial text is equivalent to maximizing Mahalanobis distance under Gaussian assumptions",
    377       "evidence": "Theorem 1 with full proof in Appendix B. The derivation shows KL(N1||N2) = 1/2 (z-x)^T Σ^{-1} (z-x) under the Gaussian assumption.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "The attack method is query-free with low computational cost",
    382       "evidence": "The method design uses an auxiliary model to generate adversarial text offline (Section III-D-F) without querying the victim model. However, no actual computational cost figures are reported.",
    383       "supported": "weak"
    384     },
    385     {
    386       "claim": "ChatGPT-4-Turbo attack model has the strongest transferability while Llama-2-7b has the weakest defensive ability",
    387       "evidence": "Figure 4 shows a transfer success rate heatmap across model pairs (Section V-A). Single heatmap without statistical tests or multiple runs.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "Optimal attack parameters are ε=0.2 and γ=0.5",
    392       "evidence": "Table V shows ASR values for 18 (ε, γ) combinations on GSM8K with ChatGPT-3.5. The (0.2, 0.5) pair yields 46.35% ASR. Authors state 'This combination of parameters works very well when we apply it to other datasets' but do not show the full grid for other datasets.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "Insertion position of injected text has minimal impact on attack performance",
    397       "evidence": "Table IV shows one example question with injection at 6 different positions, all producing wrong answers. However, this is a single example, not a systematic evaluation across the test set.",
    398       "supported": "weak"
    399     }
    400   ],
    401   "red_flags": [
    402     {
    403       "flag": "No error bars or variance reporting",
    404       "detail": "All experimental results across Tables I, II, III, and V are single-point estimates. With no variance information, it is impossible to assess whether observed differences between methods are meaningful or within noise. The attack involves random elements (synonym selection, random number generation) that would produce different results across runs."
    405     },
    406     {
    407       "flag": "No limitations section",
    408       "detail": "The paper has no limitations, threats-to-validity, or broader impact discussion. It moves directly from ablation studies to a brief conclusion without acknowledging any weaknesses."
    409     },
    410     {
    411       "flag": "Imperceptibility claimed but not tested",
    412       "detail": "Section III-A claims the attack is 'imperceptible' and that 'the model's active defense mechanisms make it difficult to detect the presence of our prompt injections.' This is never validated — no human evaluation of whether injected text is detectable, and no automated detection tests."
    413     },
    414     {
    415       "flag": "Factual error in dataset description",
    416       "detail": "Section IV-A2 describes GSM8K as 'consisting of 800 billion words' and 'the largest language model training resource available today.' GSM8K is actually a grade school math dataset with ~8.5K problems. The citation [27] is attributed to 'Brown et al.' which does not match the actual GSM8K paper by Cobbe et al. (2021). This suggests carelessness in the writing."
    417     },
    418     {
    419       "flag": "Parameter tuning on test data",
    420       "detail": "Parameters ε=0.2, γ=0.5 are selected by exhaustive grid search on GSM8K test data with GPT-3.5 (Table V), then the same parameters are applied to report 'main results' that include GSM8K. No held-out validation set is used, risking overfitting to the evaluation data."
    421     },
    422     {
    423       "flag": "Insertion position claim based on single example",
    424       "detail": "Table IV claims insertion position has 'minimal impact on attack performance' but shows only one question with different insertion positions. A single example cannot support this general claim."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "Survey of vulnerabilities in large language models revealed by adversarial attacks",
    430       "authors": ["E. Shayegani", "M. A. A. Mamun", "Y. Fu", "P. Zaree", "Y. Dong", "N. Abu-Ghazaleh"],
    431       "year": 2023,
    432       "arxiv_id": "2310.10844",
    433       "relevance": "Comprehensive survey of LLM adversarial vulnerabilities, directly relevant to understanding the attack landscape."
    434     },
    435     {
    436       "title": "Ignore previous prompt: Attack techniques for language models",
    437       "authors": ["F. Perez", "I. Ribeiro"],
    438       "year": 2022,
    439       "arxiv_id": "2211.09527",
    440       "relevance": "Foundational work on prompt injection attacks defining target hijacking and prompt leakage goals."
    441     },
    442     {
    443       "title": "Prompt injection attack against LLM-integrated applications",
    444       "authors": ["Y. Liu", "G. Deng", "Y. Li", "K. Wang", "T. Zhang", "Y. Liu", "H. Wang", "Y. Zheng", "Y. Liu"],
    445       "year": 2023,
    446       "arxiv_id": "2306.05499",
    447       "relevance": "Studies LLM sensitivity to escape characters and delimiters as implicit prompt injection vectors."
    448     },
    449     {
    450       "title": "Gradient-based adversarial attacks against text transformers",
    451       "authors": ["C. Guo", "A. Sablayrolles", "H. Jégou", "D. Kiela"],
    452       "year": 2021,
    453       "arxiv_id": "2104.13733",
    454       "relevance": "Proposes GBDA white-box attack using Gumbel-Softmax, a key baseline for adversarial text attacks on transformers."
    455     },
    456     {
    457       "title": "BERT-ATTACK: adversarial attack against BERT using BERT",
    458       "authors": ["L. Li", "R. Ma", "Q. Guo", "X. Xue", "X. Qiu"],
    459       "year": 2020,
    460       "relevance": "Context-aware word replacement attack using masked language models, one of the baselines compared in this paper."
    461     },
    462     {
    463       "title": "PromptBench: towards evaluating the robustness of large language models on adversarial prompts",
    464       "authors": ["K. Zhu", "J. Wang", "J. Zhou", "Z. Wang", "H. Chen", "Y. Wang", "L. Yang", "W. Ye", "N. Z. Gong", "Y. Zhang"],
    465       "year": 2023,
    466       "arxiv_id": "2306.04528",
    467       "relevance": "Benchmark framework for evaluating LLM robustness to adversarial prompts; provides baseline attack methods compared against in this paper."
    468     },
    469     {
    470       "title": "Universal adversarial triggers for attacking and analyzing NLP",
    471       "authors": ["E. Wallace", "S. Feng", "N. Kandpal", "M. Gardner", "S. Singh"],
    472       "year": 2021,
    473       "arxiv_id": "1908.07125",
    474       "relevance": "Proposes gradient-guided universal adversarial triggers for NLP models, foundational work on trigger-based attacks."
    475     },
    476     {
    477       "title": "TextAttack: a framework for adversarial attacks, data augmentation, and adversarial training in NLP",
    478       "authors": ["J. X. Morris", "E. Lifland", "J. Y. Yoo", "J. Grigsby", "D. Jin", "Y. Qi"],
    479       "year": 2020,
    480       "arxiv_id": "2005.05909",
    481       "relevance": "Unified framework for adversarial attacks and defenses in NLP, relevant to attack tooling and methodology."
    482     },
    483     {
    484       "title": "MathAttack: attacking large language models towards math solving ability",
    485       "authors": ["Z. Zhou", "Q. Wang", "M. Jin", "J. Yao", "J. Ye", "W. Liu", "W. Wang", "X. Huang", "K. Huang"],
    486       "year": 2023,
    487       "arxiv_id": "2309.01686",
    488       "relevance": "Domain-specific attack on LLM mathematical reasoning; directly compared with G2PIA on GSM8K."
    489     },
    490     {
    491       "title": "Is BERT really robust? A strong baseline for natural language attack on text classification and entailment",
    492       "authors": ["D. Jin", "Z. Jin", "J. T. Zhou", "P. Szolovits"],
    493       "year": 2020,
    494       "relevance": "Proposes TextFooler attack method, one of the key baselines compared in this paper."
    495     },
    496     {
    497       "title": "GPT-4 technical report",
    498       "authors": ["OpenAI"],
    499       "year": 2023,
    500       "relevance": "Technical report for GPT-4, one of the victim models evaluated in the attack experiments."
    501     },
    502     {
    503       "title": "Llama: Open and efficient foundation language models",
    504       "authors": ["H. Touvron", "T. Lavril", "G. Izacard"],
    505       "year": 2023,
    506       "arxiv_id": "2302.13971",
    507       "relevance": "Describes the Llama model family; Llama-2 variants are used as victim models in the experiments."
    508     }
    509   ],
    510   "engagement_factors": {
    511     "practical_relevance": {
    512       "score": 1,
    513       "justification": "The attack method requires BERT, word2vec, and an auxiliary LLM to generate adversarial text — not immediately usable without significant implementation effort, and no code is released."
    514     },
    515     "surprise_contrarian": {
    516       "score": 1,
    517       "justification": "The theoretical connection between KL-divergence and Mahalanobis distance is novel, but the overall finding that LLMs are vulnerable to prompt injection is well-established."
    518     },
    519     "fear_safety": {
    520       "score": 2,
    521       "justification": "Demonstrates a query-free generative prompt injection attack with up to 81% ASR on a production model (GPT-3.5-Turbo on SQuAD2.0), raising concerns about LLM robustness in deployed systems."
    522     },
    523     "drama_conflict": {
    524       "score": 0,
    525       "justification": "No controversy, no accusations, no challenge to specific companies or claims."
    526     },
    527     "demo_ability": {
    528       "score": 0,
    529       "justification": "No code, demo, or tool is released; the method cannot be tried without reimplementation."
    530     },
    531     "brand_recognition": {
    532       "score": 1,
    533       "justification": "Attacks GPT-3.5/4 and Llama-2 (well-known models) but authors are from a lesser-known university without major industry affiliation."
    534     }
    535   }
    536 }

Impressum · Datenschutz