ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (31826B)


      1 {
      2   "paper": {
      3     "title": "Neural Exec: Learning (and Learning from) Execution Triggers for Prompt Injection Attacks",
      4     "authors": [
      5       "Dario Pasquini",
      6       "Martin Strohmeier",
      7       "Carmela Troncoso"
      8     ],
      9     "year": 2024,
     10     "venue": "AISec@CCS",
     11     "arxiv_id": "2403.03792",
     12     "doi": "10.1145/3689932.3694764"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval", "case-study"],
     17   "key_findings": "The paper introduces Neural Exec, optimization-based execution triggers for prompt injection attacks that are 200-500% more effective than handcrafted triggers across four open-source LLMs (Llama-3-8B, Mistral-7B, OpenChat3.5, Mixtral-8x7B). The triggers can persist through RAG preprocessing pipelines with ~80% success at default chunk sizes, thanks to inlining and Semantic-Oblivious Injection properties. Analysis reveals a large and diverse space of valid triggers that deviate markedly from known attack patterns, undermining blacklist-based detection strategies. Bootstrapped Neural Execs exhibit improved transferability across models at a small cost in target-model performance.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract states 'Code available at https://github.com/pasquini-dario/LLM_NeuralExec' providing a working GitHub URL."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper uses publicly available datasets: SQuAD 2.0 for guide-text and queries, and the Alpaca instruction set (~50,000 instructions) for payloads. Both are standard public datasets."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is mentioned in the paper. Model names are given but dependency versions are not listed."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper describes the optimization process in detail (Section 4.3) but does not provide step-by-step reproduction instructions or a README with commands to run experiments."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Figure 6 shows error bars for baselines (min/max across 12 handcrafted triggers), but these represent variation across different trigger designs, not repeated runs. Neural Exec results are point estimates from a single trigger per model with no uncertainty quantification."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims Neural Exec triggers are '200% to 500% more effective' and 'at least twice as effective' based solely on comparing execution accuracy numbers without any statistical significance tests."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Effect sizes are provided with baseline context: e.g., 'accuracy gain achieved by the Neural Exec trigger reaches an increase of 500% compared to the best-performing baseline trigger' (Section 5.2), and Figure 6 shows absolute accuracy values (e.g., ~91% vs baselines) enabling magnitude assessment."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The test set of 100 prompts and payloads is used without justification for why 100 is sufficient. The RAG evaluation uses 500 repetitions (Appendix B) but again without power analysis or justification."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Only one Neural Exec trigger is generated per model and evaluated. No variance across multiple optimization runs or random seeds is reported for the main results."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "12 handcrafted execution triggers from prior work are used as baselines, including triggers from Perez et al. [40], Liu et al. [35], and blog posts [16, 18]. The top-5 are listed in Figure 15."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The baselines include the most recent handcrafted triggers available at the time of writing, from Perez et al. (2022) and Liu et al. (2023), which represent the state of the art in handcrafted prompt injection triggers."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Multiple ablation-like experiments: prior-free vs. bootstrapped triggers (Section 5.2 vs 5.3), with vs without SOI optimization (Figure 8), different token budgets (15+5, 5+15, 30+10 in Figure 9), and inline vs non-inline configurations."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper uses ExeAcc (execution accuracy) with perfect/partial execution breakdown, plus RAG pipeline persistence rate (percentage of armed payloads surviving preprocessing) as distinct metrics."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "Attack success is evaluated using an LLM-based fuzzy matching function (Mixtral-8x7B as verifier, Appendix D), not human evaluation. No human judges assessed whether attacks were successful."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Section 5.1 states: 'we use a set of 100 prompts and payloads produced according to Section 4.3.3 and completely disjoint from the training and validation set used for the Neural Exec triggers creation.'"
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down per model (4 LLMs in Figure 6), per trigger type (prior-free, bootstrapped, baseline), and for RAG experiments across chunk sizes (300-900) and k values (2-5) in Figure 8."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Partial executions are discussed and exemplified (Figure 22), limited transferability of prior-free Neural Execs is shown (Figure 21 and Appendix E), and the paper discusses failure modes in RAG (chunker breaking armed payloads, retrieval filtering)."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Several negative results: bootstrapped triggers 'tend to perform slightly worse than prior-free ones' (Section 5.3), prior-free Neural Execs 'exhibit limited transferability' (Appendix E), and SOI optimization 'tends to override the bootstrapping process' (Section 5.3)."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims of 'drastically more effective' triggers are supported by Figure 6 showing 200-500% improvement. Claims about RAG robustness are supported by Figure 8. Claims about sidestepping blacklist detection are supported by Section 6.1's analysis of trigger diversity."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Causal claims like 'Neural Exec triggers achieve an improvement' are justified through controlled comparisons: same test set, same models, same evaluation metric, varying only the trigger type. Ablation-style experiments (with/without SOI, bootstrapped vs prior-free) use controlled single-variable manipulation."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The threat model (Section 3) explicitly bounds to 'the setting where the target model is an open-source language model for which white-box access is available.' The conclusion acknowledges 'additional investigations are necessary to fully comprehend the feasibility of generating Neural Execs for closed-source LLMs.'"
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "No discussion of alternative explanations for the observed effectiveness improvements. The paper does not consider confounds such as token budget differences between Neural Exec and baselines (Neural Exec is inline while baselines use newlines consuming tokens), or whether the LLM-based evaluator may be biased toward certain trigger formats."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper carefully defines ExeAcc (Section 5.1.1), distinguishes between 'perfect execution' and 'partial execution', and acknowledges that fuzzy matching via LLM is a proxy by detailing its implementation (Appendix D) and validation process."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Specific model versions are given with HuggingFace links: 'Llama-3-8B-Instruct', 'Mistral-7B-Instruct-v0.2', 'OpenChat3.5', 'Mixtral-8x7B' (Section 5.1), each with a reference to the exact HuggingFace model card."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Extensive prompt text is provided: the fuzzy matching prompt template (Figure 19), prompt template examples (Figures 3, 12, 23), actual generated Neural Exec triggers (Figures 5, 7, 9, 10), and the list of tasks for prompt generation (Figure 17). Code release includes the generation process."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "Temperature is stated as 0 for evaluation. The optimization uses 150-250 iterations, positional exponential decay weighting, and token budgets (15+5 default). However, key GCG hyperparameters are not specified: batch size k, number of candidate substitutions K, top-m vocabulary pool size, and the annealing schedule for the token swap count e."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. The method is a direct optimization pipeline producing adversarial strings."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Appendix A details prompt generation from five modular components (prompt class, task, system prompt, guide-text, payload). Payload filtering criteria are stated (>150 characters). Appendix B describes the RAG simulation setup. SQuAD 2.0 and Alpaca dataset usage is documented."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No dedicated limitations section. Section 7 ('Conclusion and Future Work') briefly mentions the black-box setting as future work but does not substantively discuss limitations of the current approach."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No threats to validity are discussed. There is no mention of potential issues with the LLM-based evaluator reliability, the representativeness of the test set, or the generalizability of the optimization approach."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 3 explicitly states: 'We focus on the setting where the target model is an open-source language model for which white-box access is available to A.' Section 7 states: 'additional investigations are necessary to fully comprehend the feasibility of generating Neural Execs for closed-source LLMs.'"
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "Individual attack outcomes (per prompt/payload results) are not released. Only aggregate execution accuracy values are reported in figures."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Data sources are described: Alpaca instruction set for payloads with filtering (>150 chars), SQuAD 2.0 for guide-text and queries, and the modular prompt generation process is detailed in Appendix A."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. Data sources are standard public benchmarks (SQuAD 2.0, Alpaca)."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The pipeline is documented from dataset selection through prompt generation (Appendix A), trigger optimization (Section 4.3), to evaluation (Section 5.1). The RAG simulation pipeline is documented in Appendix B with 500 repetitions per configuration."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding acknowledgment or grant information is present in the paper. Author affiliations include EPFL (academic) and armasuisse (Swiss defense), but no explicit funding disclosure."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly listed: 'SPRING Lab, EPFL' (Pasquini and Troncoso) and 'armasuisse' (Strohmeier). These are not companies whose products are being evaluated."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funder is disclosed, making it impossible to assess independence. armasuisse is a Swiss defense procurement agency that could have interest in understanding LLM attack vectors, but no explicit funding relationship is stated."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests statement or financial interest declaration appears in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "This paper tests adversarial attack effectiveness against LLMs, not model capabilities on a knowledge benchmark. The models' training cutoff is irrelevant to whether adversarial triggers can manipulate behavior."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "The study evaluates adversarial prompt injection attacks, not model knowledge on benchmarks. Train/test overlap of the LLM's pretraining data is not a relevant concern for attack effectiveness."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "This is an adversarial attack study, not a capability benchmark evaluation. Benchmark contamination does not apply to measuring whether adversarial triggers can hijack model behavior."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No inference cost, API cost, or wall-clock time is reported for either trigger generation or attack evaluation. The optimization requires 150-250 iterations with gradient computation but the actual time/cost is not stated."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No GPU type, total GPU hours, or computational budget is stated for the optimization process, despite requiring white-box access and gradient computation on models up to 47B parameters (Mixtral-8x7B)."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "Section 6.1 mentions that 'the only difference in setup is the random seed' when comparing two triggers, but does not systematically report attack success across multiple seeds. Only one trigger per model is evaluated in the main results."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": true,
    305         "justification": "The test set size is stated as 100 prompts/payloads (Section 5.1). For RAG experiments, 'We repeat the procedure 500 times for each trigger and setup' (Appendix B). Optimization uses a validation set of 100 samples."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No hyperparameter search budget is reported. Key optimization parameters (batch size k, candidate count K, vocabulary pool m, token swap count e) are described but their values and any tuning process are not stated."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "The optimization uses a validation set: 'we select a sample of 100 prompts and payloads to serve as a validation set. We iterate on the optimization as long as there is a consistent reduction in the average loss on this set' (Section 4.3.2). Test evaluation is on a disjoint set."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors propose Neural Exec and compare against baselines they implemented. No discussion of whether their implementation of baseline triggers may differ from original implementations or whether self-evaluation bias affects the comparison."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "Neural Exec requires GPU-based optimization with gradient computation (150-250 iterations on large LLMs), while baselines are zero-cost handcrafted strings. This fundamental compute asymmetry is never discussed or accounted for."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "The paper defines ExeAcc carefully (Section 5.1.1), distinguishes perfect from partial execution, details the LLM-based fuzzy matching implementation (Appendix D with validation on 20 pairs), and discusses what constitutes a successful attack. The proxy measure is well-specified relative to the claim."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is involved. The method directly generates adversarial strings without an agentic scaffold."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "SQuAD 2.0 (published 2018) and Alpaca (2023) are used in the test set. The tested models (e.g., Llama-3, 2024) may have seen SQuAD in training, which could affect how models process prompts containing SQuAD text. This is not discussed."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the evaluation setup (using SQuAD-based prompts with known answer patterns) could leak information that affects attack success measurement."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": true,
    357         "justification": "Section 5.1 states the test set is 'completely disjoint from the training and validation set used for the Neural Exec triggers creation,' addressing independence between optimization and evaluation data."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No concrete leakage detection method is applied. The disjointness of train/test splits is stated but not verified with overlap analysis or other detection methods."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "Neural Exec triggers achieve 200-500% improvement in effectiveness over handcrafted triggers in targeted attack scenarios.",
    369       "evidence": "Figure 6 shows execution accuracy across four LLMs. Neural Exec achieves ~91% average accuracy. The 500% gain is observed on OpenChat3.5 where baseline best is ~15% and Neural Exec reaches ~90% (Section 5.2).",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "Neural Exec triggers persist through RAG preprocessing pipelines with ~80% success at default chunk size of 500 characters.",
    374       "evidence": "Figure 8 shows persistence rates across chunk sizes and k values for a simulated RAG pipeline (Section 5.4, setup in Appendix B with 500 repetitions per configuration).",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "SOI-optimized Neural Execs outperform non-SOI versions in RAG persistence, especially at small chunk sizes and low k values.",
    379       "evidence": "Figure 8 compares Neural Exec with and without SOI optimization, showing improved persistence at smaller chunk sizes. 'The improvement margin is marked for small chunk-size settings' (Section 5.4).",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "The space of valid execution triggers is large and diverse — triggers generated under identical conditions with different seeds share only 5% of common tokens.",
    384       "evidence": "Section 6.1 and Figure 9 show four triggers with equivalent average execution scores but substantial variation in composition. 'Two triggers created with a 15+5 token configuration... sharing only a 5% of common tokens.'",
    385       "supported": "weak"
    386     },
    387     {
    388       "claim": "Bootstrapped Neural Execs trade slight target-model performance for improved cross-model transferability.",
    389       "evidence": "Figure 6 shows that bootstrapped triggers slightly underperform prior-free ones. Figure 21 shows that the bootstrapped OpenChat3.5 trigger outperforms baselines by 170% on other models (Appendix E).",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Neural Execs persist through fine-tuning of the target model, maintaining effectiveness above best baselines.",
    394       "evidence": "Figure 20 shows a prior-free Mistral-7B Neural Exec maintains effectiveness above baselines across 5 fine-tuned Mistral-7B derivatives (Appendix E.1).",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "Neural Exec-inspired handcrafted triggers outperform existing baselines and transfer across LLMs including ChatGPT-4.",
    399       "evidence": "Figure 11 shows the Neural Exec-inspired trigger (Figure 10) outperforming baselines on all five tested models including ChatGPT-4 (February), achieving approximately five times the success rate on ChatGPT-4 (Section 6.2.3).",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "red_flags": [
    404     {
    405       "flag": "LLM-as-judge evaluation",
    406       "detail": "Attack success is evaluated using Mixtral-8x7B as a fuzzy matching verifier (Appendix D). LLM-based evaluation can be unreliable and may systematically favor certain output patterns. The validation of this evaluator is minimal (20 manual pairs)."
    407     },
    408     {
    409       "flag": "Single trigger per model, no variance",
    410       "detail": "Only one Neural Exec trigger is generated and evaluated per model in the main experiments. With no variance across optimization runs, it is impossible to know whether the reported results are typical or lucky outcomes."
    411     },
    412     {
    413       "flag": "No statistical significance testing",
    414       "detail": "Claims of 200-500% improvement are made by comparing point estimates without any statistical tests, confidence intervals, or measures of variability across runs."
    415     },
    416     {
    417       "flag": "Compute asymmetry not acknowledged",
    418       "detail": "Neural Exec requires gradient-based optimization over 150-250 iterations on large LLMs, while baselines are zero-cost handcrafted strings. This fundamental cost disparity is never discussed or controlled for."
    419     },
    420     {
    421       "flag": "Simulated RAG evaluation",
    422       "detail": "The RAG robustness evaluation (Section 5.4) uses a simulated pipeline. Only one anecdotal proof-of-concept against HuggingChat is shown (Appendix C), which cannot be reproduced by others."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "Ignore previous prompt: Attack techniques for language models",
    428       "authors": ["Fábio Perez", "Ian Ribeiro"],
    429       "year": 2022,
    430       "relevance": "Foundational formal study of prompt injection attacks in adversarial settings, providing baseline handcrafted triggers used for comparison."
    431     },
    432     {
    433       "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection",
    434       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    435       "year": 2023,
    436       "relevance": "Formalizes indirect prompt injection against LLM agents and surveys security implications of prompt injection in real-world applications."
    437     },
    438     {
    439       "title": "Prompt injection attack against llm-integrated applications",
    440       "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li", "Kailong Wang", "Tianwei Zhang", "Yepang Liu", "Haoyu Wang", "Yan Zheng", "Yang Liu"],
    441       "year": 2023,
    442       "relevance": "Systematic analysis of prompt injection attacks with handcrafted separator-based triggers, providing baselines and attack taxonomy used in this work."
    443     },
    444     {
    445       "title": "Universal and transferable adversarial attacks on aligned language models",
    446       "authors": ["Andy Zou", "Zifan Wang", "J. Zico Kolter", "Matt Fredrikson"],
    447       "year": 2023,
    448       "relevance": "Introduces the GCG algorithm for discrete adversarial optimization on LLMs, which Neural Exec adapts for generating execution triggers."
    449     },
    450     {
    451       "title": "Retrieval-augmented generation for knowledge-intensive nlp tasks",
    452       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"],
    453       "year": 2020,
    454       "relevance": "Foundational RAG framework paper; Neural Exec specifically targets RAG-based pipelines as a key attack vector."
    455     },
    456     {
    457       "title": "Jailbreaking black box large language models in twenty queries",
    458       "authors": ["Patrick Chao", "Alexander Robey", "Edgar Dobriban", "Hamed Hassani", "George J. Pappas", "Eric Wong"],
    459       "year": 2023,
    460       "relevance": "Black-box jailbreaking method that could be adapted for Neural Exec generation against closed-source LLMs."
    461     },
    462     {
    463       "title": "Pal: Proxy-guided black-box attack on large language models",
    464       "authors": ["Chawin Sitawarin", "Norman Mu", "David Wagner", "Alexandre Araujo"],
    465       "year": 2024,
    466       "relevance": "Black-box adversarial attack method based on GCG that could be adapted for Neural Exec generation against proprietary LLMs."
    467     },
    468     {
    469       "title": "Llama 2: Open foundation and fine-tuned chat models",
    470       "authors": ["Touvron et al."],
    471       "year": 2023,
    472       "relevance": "Widely used open-source LLM family; its successor release, Llama-3-8B-Instruct, is one of the four target models in the evaluation."
    473     },
    474     {
    475       "title": "Are aligned neural networks adversarially aligned?",
    476       "authors": ["Nicholas Carlini", "Milad Nasr", "Christopher A. Choquette-Choo"],
    477       "year": 2023,
    478       "relevance": "Studies adversarial robustness of aligned LLMs, establishing that safety alignment does not prevent adversarial manipulation."
    479     },
    480     {
    481       "title": "Language models are few-shot learners",
    482       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    483       "year": 2020,
    484       "relevance": "GPT-3 paper establishing the foundation for modern LLM capabilities that prompt injection attacks exploit."
    485     },
    486     {
    487       "title": "Universal adversarial triggers for attacking and analyzing NLP",
    488       "authors": ["Eric Wallace", "Shi Feng", "Nikhil Kandpal", "Matt Gardner", "Sameer Singh"],
    489       "year": 2019,
    490       "relevance": "Early work on universal adversarial triggers for NLP systems, establishing optimization-based adversarial input generation."
    491     },
    492     {
    493       "title": "AutoPrompt: Eliciting knowledge from language models with automatically generated prompts",
    494       "authors": ["Taylor Shin", "Yasaman Razeghi", "Robert L. Logan IV", "Eric Wallace", "Sameer Singh"],
    495       "year": 2020,
    496       "relevance": "Gradient-based discrete prompt optimization method that informs the optimization approach used in Neural Exec."
    497     }
    498   ],
    499   "engagement_factors": {
    500     "practical_relevance": {
    501       "score": 2,
    502       "justification": "Security researchers and red-teamers can use the released code to test LLM applications, but the approach requires white-box model access and ML expertise."
    503     },
    504     "surprise_contrarian": {
    505       "score": 2,
    506       "justification": "Demonstrates that optimization-based triggers are far more effective than handcrafted ones and can evade existing blacklist defenses, challenging the assumption that prompt injection is a largely solved detection problem."
    507     },
    508     "fear_safety": {
    509       "score": 3,
    510       "justification": "Demonstrates a novel attack family that can bypass RAG preprocessing, evade detection systems, and inject arbitrary instructions including phishing and reverse shells (Figures 1, 18)."
    511     },
    512     "drama_conflict": {
    513       "score": 1,
    514       "justification": "No major controversy or conflict narrative; the paper is a technical contribution that advances adversarial capabilities without calling out specific vendors."
    515     },
    516     "demo_ability": {
    517       "score": 2,
    518       "justification": "Code is released on GitHub and a HuggingChat proof-of-concept is demonstrated, but running the optimization requires significant GPU compute."
    519     },
    520     "brand_recognition": {
    521       "score": 1,
    522       "justification": "EPFL is well-regarded in security research and armasuisse adds intrigue, but neither is a mainstream AI brand."
    523     }
    524   }
    525 }

Impressum · Datenschutz