ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24285B)


      1 {
      2   "paper": {
      3     "title": "Automatic and Universal Prompt Injection Attacks against Large Language Models",
      4     "authors": [
      5       "Xiaogeng Liu",
      6       "Zhiyuan Yu",
      7       "Yizhe Zhang",
      8       "Ning Zhang",
      9       "Chaowei Xiao"
     10     ],
     11     "year": 2024,
     12     "arxiv_id": "2403.04957"
     13   },
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The abstract states 'Code is available at https://github.com/SheltonLiu-N/Universal-Prompt-Injection', providing a working repository URL."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses standard benchmark datasets (MRPC, Jfleg, HSOL, RTE, SST2, SMS Spam, Gigaword), all of which are publicly accessible."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided in the paper. The paper mentions using Llama2-7b-chat but does not specify library versions or environment setup details."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "While code is released and hyperparameters are listed in Section 3.1, the paper does not provide step-by-step reproduction instructions or a README-style guide for replicating experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Tables 1 and 2 report only point estimates of attack success rates with no confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims their method outperforms baselines (e.g., 'an average 50% attack success rate' vs 0% for baselines) but provides no statistical significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports concrete attack success rates with baseline context, e.g., baselines at 0% ASR vs their method at ~50% average, and '21% improvement' of M-GCG over GCG in Table 2. These provide sufficient context for effect magnitude."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper tests on 200 samples per dataset (1400 total) but provides no justification for why this sample size is sufficient. The training set of 5 samples is noted but not justified via power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or any measure of spread across runs is reported. It is unclear whether results are from single or multiple runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Three baselines are included: naive, combined (Liu et al., 2023c), and repeated (Toyer et al., 2023), described in Section 3.1 and Appendix B."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The baselines are from 2022-2023 (Toyer et al. 2023, Liu et al. 2023c), which are contemporary to this 2024 paper and represent state-of-the-art handcrafted prompt injection methods."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 3.3 presents an ablation comparing GCG (without momentum) vs M-GCG (with momentum), with loss curves in Figure 4 and quantitative results in Table 2."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Two metrics are used: keyword-evaluation ASR (KEY-E) and LLM-evaluation ASR (LM-E), as defined in Section 3.1."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation is performed. Attack success is measured only by keyword matching and GPT-4 automated evaluation. Human evaluation of attack outputs would be relevant to validate whether attacks actually deceive users."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper trains on only 5 samples and tests on 200 samples per dataset (1400 total). Additionally, two datasets (Spam Detection and Summarization, marked with * in Table 1) were not used during training, serving as held-out evaluation tasks."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 1 provides per-dataset breakdowns across all 7 datasets and all 3 attack objectives, rather than just aggregate averages."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 5 discusses the limitation that their method is weak against PPL detection defense (Alon & Kamfonas, 2023). Section 3.2 notes interesting variation patterns, e.g., summarization is hardest for static but easiest for semi-dynamic."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports low success rates for certain objectives/datasets (e.g., 0.10 for semi-dynamic on Natural Language Inference, 0.10 for dynamic on Spam Detection). The PPL defense limitation is also reported as a negative result in Section 5."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims 'superior performance compared with baselines' with 'only five training samples (0.3% relative to the test data)'. Table 1 supports this: baselines achieve 0% ASR while the proposed method achieves substantial rates. The abstract also claims effectiveness 'even in the face of defensive measures', supported by Figure 5."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The ablation study in Section 3.3 compares GCG vs M-GCG with controlled single-variable manipulation (adding momentum), providing adequate evidence for the causal claim that momentum enhances convergence and performance (Table 2)."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title claims 'Universal Prompt Injection Attacks against Large Language Models' (plural), but experiments are conducted only on Llama2-7b-chat. No other LLMs are tested. The claim of universality across models is not supported."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for its results. For example, it does not consider whether the baselines' 0% success rate might be due to the evaluation protocol rather than inherent ineffectiveness, or whether the results are specific to Llama2's safety training."
    131       }
    132     },
    133     "setup_transparency": {
    134       "model_versions_specified": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The victim model is specified as 'Llama2-7b-chat' (Section 3.1), which is a specific model version. The LLM evaluator is specified as 'GPT-4-0613' in Appendix C."
    138       },
    139       "prompts_provided": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Appendix B provides the full text of baseline attack prompts for all three objectives. Appendix C provides the full LLM evaluator prompt. The optimization target format is described in Section 2.3 with examples."
    143       },
    144       "hyperparameters_reported": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Section 3.1 reports: top-k=128, batch size=256, iterations=1000, momentum weight=1.0, injection content token length=150."
    148       },
    149       "scaffolding_described": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "This paper does not use agentic scaffolding. The attack is a gradient-based optimization method, not an agent-based system."
    153       },
    154       "data_preprocessing_documented": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "The paper lists the datasets used but does not describe how data was preprocessed, what subset of each dataset was selected, or how the 200 test samples per dataset were chosen."
    158       }
    159     },
    160     "limitations_and_scope": {
    161       "limitations_section_present": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 5 is titled 'Conclusions, Limitation, and Future Work' and includes a dedicated paragraph on the weakness against PPL detection defense."
    165       },
    166       "threats_to_validity_specific": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The limitation discussion mentions only the PPL detection weakness. It does not discuss broader threats to validity such as single-model evaluation, potential overfitting to Llama2, or whether the evaluation protocol itself might bias results."
    170       },
    171       "scope_boundaries_stated": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "The paper does not explicitly state what the results do NOT show. Despite testing only on Llama2-7b-chat, the title and claims suggest broad applicability to 'Large Language Models' without bounding the scope to the tested model."
    175       }
    176     },
    177     "data_integrity": {
    178       "raw_data_available": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No raw experimental outputs (model responses, per-example attack results) are released. Only aggregate ASR numbers are reported."
    182       },
    183       "data_collection_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 3.1 describes the datasets used and Appendix A describes how adversarial goals were generated using ChatGPT with a danger scale rating process."
    187       },
    188       "recruitment_methods_described": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "No human participants are involved. The study uses standard NLP benchmark datasets."
    192       },
    193       "data_pipeline_documented": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "The paper does not document the full pipeline from dataset selection to final evaluation results. How the 200 test samples per dataset were selected and how training samples were chosen is not explained."
    197       }
    198     },
    199     "conflicts_of_interest": {
    200       "funding_disclosed": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "No funding information or acknowledgments section is present in the paper."
    204       },
    205       "affiliations_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Author affiliations are listed (superscripts 1, 2, and 3 are shown). Based on the arXiv listing, the authors are from the University of Wisconsin-Madison, Washington University in St. Louis, and Apple."
    209       },
    210       "funder_independent_of_outcome": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding source is disclosed, so independence cannot be assessed. One author appears to be affiliated with Apple, which develops LLM products, but this is not discussed."
    214       },
    215       "financial_interests_declared": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No competing interests or financial interests statement is present in the paper."
    219       }
    220     },
    221     "contamination": {
    222       "training_cutoff_stated": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "The paper uses Llama2-7b-chat as the victim model but does not state its training data cutoff date, which is relevant for understanding whether the model may have seen the evaluation datasets during pre-training."
    226       },
    227       "train_test_overlap_discussed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No discussion of whether the NLP benchmark datasets (MRPC, SST2, RTE, etc.) appeared in Llama2's training data. Since these are well-known benchmarks, contamination is plausible."
    231       },
    232       "benchmark_contamination_addressed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "The evaluation datasets (MRPC, SST2, RTE, etc.) were all published well before Llama2's training. The paper does not discuss whether the model's familiarity with these datasets affects the attack evaluation results."
    236       }
    237     },
    238     "human_studies": {
    239       "pre_registered": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this study."
    243       },
    244       "irb_or_ethics_approval": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "demographics_reported": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "inclusion_exclusion_criteria": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "randomization_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "blinding_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       },
    269       "attrition_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are involved in this study."
    273       }
    274     },
    275     "cost_and_practicality": {
    276       "inference_cost_reported": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No inference cost, latency, or API costs are reported despite the method requiring 1000 optimization iterations with gradient computation on an LLM."
    280       },
    281       "compute_budget_stated": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No GPU hours, hardware specifications, or total compute budget is stated, despite the method requiring gradient-based optimization on a 7B parameter model."
    285       }
    286     }
    287   },
    288   "claims": [
    289     {
    290       "claim": "With only five training samples (0.3% of test data), the proposed attack achieves superior performance compared with baselines.",
    291       "evidence": "Table 1 shows baselines (naive, combined, repeated) achieve 0% ASR across all datasets and objectives, while the proposed method achieves an average 81% KEY-E for static, 37%/35% KEY-E/LM-E for semi-dynamic, and 39%/34% KEY-E/LM-E for dynamic objectives.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "The momentum-enhanced optimization (M-GCG) improves convergence speed and solution quality compared to standard GCG.",
    296       "evidence": "Table 2 shows M-GCG achieves 0.81/0.37/0.39 vs GCG's 0.79/0.21/0.34 across static/semi-dynamic/dynamic objectives. Figure 4 shows loss curves with consistently faster convergence for M-GCG.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "The attack remains effective even against defense mechanisms.",
    301       "evidence": "Figure 5 shows that without adaptive strategy, defenses cause a 32% performance drop but the attack remains effective. With EOT adaptive scheme, performance recovers to 85% of the no-defense baseline (Section 3.4).",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Existing defense mechanisms that were reported effective cannot mitigate the threat of prompt injection attacks.",
    306       "evidence": "Figure 5 demonstrates that all five defenses (paraphrasing, retokenization, data prompt isolation, instructional prevention, sandwich prevention) are bypassed to varying degrees. However, the PPL detection defense is acknowledged as effective in Section 5.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "The attack demonstrates universality across various user interactions and datasets.",
    311       "evidence": "Table 1 shows effectiveness on 7 datasets including 2 unseen during training (marked with *). However, only one model (Llama2-7b-chat) is tested, so universality across models is not demonstrated.",
    312       "supported": "weak"
    313     }
    314   ],
    315   "methodology_tags": [
    316     "benchmark-eval"
    317   ],
    318   "key_findings": "The paper introduces a momentum-enhanced gradient-based optimization method (M-GCG) for automatically generating universal prompt injection attacks against LLMs. Using only 5 training samples, the attack achieves an average 50% attack success rate across 7 NLP datasets and 3 attack objectives (static, semi-dynamic, dynamic), while all handcrafted baseline attacks achieve 0% under the same evaluation protocol. The attack also demonstrates resilience against 5 existing defense mechanisms, with adaptive strategies recovering 85% of original performance. A key limitation is that all experiments are conducted on a single model (Llama2-7b-chat), limiting generalizability claims.",
    319   "red_flags": [
    320     {
    321       "flag": "Single model evaluation with broad generalization claims",
    322       "detail": "The title claims 'Attacks against Large Language Models' (plural) but all experiments use only Llama2-7b-chat. No closed-source models (GPT-4, Claude) or other open models are tested. The universality claim is only validated across datasets and tasks, not across different LLMs."
    323     },
    324     {
    325       "flag": "No uncertainty quantification",
    326       "detail": "All results are point estimates with no error bars, confidence intervals, or indication of variance across runs. For a stochastic optimization process with random token sampling, this is a significant omission."
    327     },
    328     {
    329       "flag": "Baselines may be unfairly weak under the evaluation protocol",
    330       "detail": "All baselines achieve exactly 0% ASR across every dataset and objective. The paper's evaluation protocol focuses on 'malicious goals' rather than 'benign task-switching', which may structurally disadvantage handcrafted methods designed for a different threat model. The paper acknowledges this shift but does not test baselines with adaptations to the new protocol."
    331     },
    332     {
    333       "flag": "Missing compute costs",
    334       "detail": "Running 1000 gradient optimization iterations on a 7B parameter model is computationally expensive, but no hardware, runtime, or cost information is provided. This makes practical applicability assessment impossible."
    335     },
    336     {
    337       "flag": "Contamination risk unaddressed",
    338       "detail": "The evaluation uses well-known NLP benchmarks (MRPC, SST2, RTE, etc.) that may be in Llama2's training data. Whether the model's familiarity with these tasks affects attack success is not discussed."
    339     }
    340   ],
    341   "cited_papers": [
    342     {
    343       "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    344       "authors": ["Andy Zou", "Zifan Wang", "J. Zico Kolter", "Matt Fredrikson"],
    345       "year": 2023,
    346       "arxiv_id": "2307.15043",
    347       "relevance": "Foundational work on GCG jailbreak attacks that this paper builds upon; core method for adversarial prompt optimization."
    348     },
    349     {
    350       "title": "Not what you've signed up for: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    351       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    352       "year": 2023,
    353       "arxiv_id": "2302.12173",
    354       "relevance": "Seminal work on indirect prompt injection attacks in LLM-integrated applications, defining the threat model this paper formalizes."
    355     },
    356     {
    357       "title": "Ignore Previous Prompt: Attack Techniques For Language Models",
    358       "authors": ["F\u00e1bio Perez", "Ian Ribeiro"],
    359       "year": 2022,
    360       "arxiv_id": "2211.09527",
    361       "relevance": "Pioneering academic study of prompt injection, introducing goal hijacking and prompt leaking taxonomies."
    362     },
    363     {
    364       "title": "Tensor Trust: Interpretable Prompt Injection Attacks from an Online Game",
    365       "authors": ["Sam Toyer", "Olivia Watkins", "Ethan Adrian Mendes", "Justin Svegliato", "Luke Bailey", "Tiffany Wang", "Isaac Ong", "Karim Elmaaroufi", "Pieter Abbeel", "Trevor Darrell", "Alan Ritter", "Stuart Russell"],
    366       "year": 2023,
    367       "arxiv_id": "2311.01011",
    368       "relevance": "Large-scale crowdsourced prompt injection study that provides one of the baselines evaluated in this paper."
    369     },
    370     {
    371       "title": "Prompt Injection Attacks and Defenses in LLM-Integrated Applications",
    372       "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
    373       "year": 2023,
    374       "arxiv_id": "2310.12815",
    375       "relevance": "Proposes the combined prompt injection attack and benchmark used as the primary baseline in this paper."
    376     },
    377     {
    378       "title": "Prompt Injection attack against LLM-integrated Applications",
    379       "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li", "Kailong Wang", "Tianwei Zhang", "Yepang Liu", "Haoyu Wang", "Yan Zheng", "Yang Liu"],
    380       "year": 2023,
    381       "arxiv_id": "2306.05499",
    382       "relevance": "Framework for prompt injection attacks applied to study 36 LLM-integrated applications, identifying 31 as vulnerable."
    383     },
    384     {
    385       "title": "Jatmo: Prompt Injection Defense by Task-Specific Finetuning",
    386       "authors": ["Julien Piet", "Maha Alrashed", "Chawin Sitawarin", "Sizhe Chen", "Zeming Wei", "Elizabeth Sun", "Basel Alomair", "David Wagner"],
    387       "year": 2024,
    388       "arxiv_id": "2312.17673",
    389       "relevance": "Defense mechanism against prompt injection through task-specific finetuning; relevant to the defense evaluation in this paper."
    390     },
    391     {
    392       "title": "Benchmarking and Defending Against Indirect Prompt Injection Attacks on Large Language Models",
    393       "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu", "Keegan Hines", "Emre Kiciman", "Guangzhong Sun", "Xing Xie", "Fangzhao Wu"],
    394       "year": 2023,
    395       "arxiv_id": "2312.14197",
    396       "relevance": "Benchmarks indirect prompt injection attacks and proposes defenses; closely related evaluation of LLM vulnerability to injection."
    397     },
    398     {
    399       "title": "Baseline defenses for adversarial attacks against aligned language models",
    400       "authors": ["Neel Jain", "Avi Schwarzschild", "Yuxin Wen", "Gowthami Somepalli", "John Kirchenbauer", "Ping-yeh Chiang", "Micah Goldblum", "Aniruddha Saha", "Jonas Geiping", "Tom Goldstein"],
    401       "year": 2023,
    402       "relevance": "Proposes paraphrasing and retokenization defenses evaluated in this paper's Section 3.4."
    403     },
    404     {
    405       "title": "AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Large Language Models",
    406       "authors": ["Xiaogeng Liu", "Nan Xu", "Muhao Chen", "Chaowei Xiao"],
    407       "year": 2023,
    408       "arxiv_id": "2310.04451",
    409       "relevance": "Related automated jailbreak generation method from some of the same authors; relevant to automated adversarial attack generation."
    410     },
    411     {
    412       "title": "TrustLLM: Trustworthiness in Large Language Models",
    413       "authors": ["Lichao Sun", "Yue Huang", "Haoran Wang", "Siyuan Wu", "Qihui Zhang", "Chujie Gao", "Yixin Huang", "Wenhan Lyu", "Yixuan Zhang", "Xiner Li"],
    414       "year": 2024,
    415       "arxiv_id": "2401.05561",
    416       "relevance": "Comprehensive trustworthiness benchmark for LLMs covering safety, robustness, and other dimensions relevant to this survey."
    417     },
    418     {
    419       "title": "Assessing Prompt Injection Risks in 200+ Custom GPTs",
    420       "authors": ["Jiahao Yu", "Yuhang Wu", "Dong Shu", "Mingyu Jin", "Xinyu Xing"],
    421       "year": 2023,
    422       "arxiv_id": "2311.11538",
    423       "relevance": "Large-scale empirical assessment of prompt injection vulnerability in deployed GPT applications."
    424     }
    425   ]
    426 }

Impressum · Datenschutz