ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (32364B)


      1 {
      2   "paper": {
      3     "title": "May I have your Attention? Breaking Fine-Tuning based Prompt Injection Defenses using Architecture-Aware Attacks",
      4     "authors": [
      5       "Nishit V. Pandya",
      6       "Andrey Labunets",
      7       "Sicun Gao",
      8       "Earlence Fernandes"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv.org",
     12     "arxiv_id": "2507.07417",
     13     "doi": "10.48550/arXiv.2507.07417"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "GCG-style optimization-based attacks do not significantly outperform unguided random search against fine-tuning-based prompt injection defenses like SecAlign, because the target logprobs loss landscape provides poor gradient signal. The proposed ASTRA attack, which targets attention matrices using gradient-derived sensitivity weights, achieves 72.5-82.5% attack success rates against SecAlign at a 35-token budget in the strong knowledge setting (GCG ranges from 12.5% at 20 tokens to 70% at 35 tokens). ASTRA++, a universal variant, achieves up to 96% success on unseen contexts against SecAlign and 87% against SecAlign++, demonstrating that these defenses do not provide the claimed security properties under modest budget scaling to 35 tokens.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract states 'We release our code and attacks at this link,' indicating a code release. The link is present as a hyperlink in the original PDF though not visible in plain text extraction."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The evaluation uses publicly available datasets: AlpacaFarm evaluation set and Dolly fine-tuning dataset. The defended models were obtained from the authors of SecAlign/StruQ. No proprietary data was collected."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is described in the paper."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are provided in the paper. While implementation parameters are described (iterations, batch size, etc.), there is no README-style guide or script to replicate experiments."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Table 2 reports mean and standard deviation over 3 training runs for ASTRA++ results (e.g., '96% (5%)', '87% (12%)'). However, Table 1 (ASTRA strong knowledge) reports only point estimates without uncertainty measures."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No statistical significance tests are used. Claims that ASTRA outperforms GCG are based solely on comparing raw ASR numbers without any hypothesis testing."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Results are presented with baseline context throughout. Table 1 shows both GCG and ASTRA ASR side by side (e.g., GCG 12.5% vs ASTRA 20% at budget 20; GCG 70% vs ASTRA 82.5% at budget 35), allowing readers to assess effect magnitude."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper states 'Due to hardware limitations, we only report our attack success rate out of 40 examples that we could complete' (Section 6.1). This is an explanation, not a justification or power analysis. No discussion of whether 40 examples is sufficient for the claims made."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Table 2 reports standard deviations across 3 training runs for ASTRA++ (e.g., 'Mean (Std. Dev.)'). Section 4.2 also reports distributions of Dr across multiple runs (r=1 to r=5) with mean and standard deviation. Table 1 (ASTRA) does not report variance."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "GCG is used as the baseline throughout. Section 4.2 also compares guided (GCG) vs unguided random search. Section 6.2 compares ASTRA++ against Universal GCG."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "GCG (Zou et al., 2023) is the state-of-the-art white-box optimization algorithm for this task. The paper also references AdvPrompter and NeuralExec as contemporary approaches and positions GCG as the strongest available baseline."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Section 6.3 presents an ablation study on the weight selection for the attention loss function, comparing clipped sensitivities against uniform weights, first-layer-only, last-layer-only, and average sensitivities (Figure 7)."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "The only evaluation metric is Attack Success Rate (ASR), defined as whether the output starts with 'Hacked' or 'hacked'. No secondary metrics (e.g., perplexity of generated text, attack transferability rate, or semantic similarity) are reported."
     89       },
     90       "human_evaluation": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "Human evaluation is not relevant for this paper's claims. The evaluation criterion is binary (does the output start with 'Hacked'?) and fully automatable."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "For ASTRA++, 10 examples are used for training and the remaining 198 unseen examples form the test set (Section 6.2). For ASTRA, the 40 evaluation examples are not used in computing the sensitivity weights (those come from Dolly)."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are broken down by model (Llama-3, Mistral), by attacker budget (20-35 tokens), by prefix/suffix configuration, and by defense (SecAlign, SecAlign++, StruQ). Tables 1 and 2 provide detailed per-condition breakdowns."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "No qualitative analysis of failure cases is provided. The paper reports aggregate success rates but does not examine why specific attacks fail on certain examples or what properties the resistant examples share."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 4.2 shows that GCG does not significantly outperform unguided search — a key negative finding. Table 2 shows high variance with some failed training runs (e.g., ASTRA++ at budget 30 has 48% std dev). Section 7.2 discusses performance limitations."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims 'success rates of up to 85-95%' which is supported by Table 2 showing 96% for SecAlign-Llama and 87% for SecAlign++. The claim that defenses 'do not provide the claimed security properties' is supported by the high ASR results across Tables 1 and 2."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper makes causal claims via controlled ablation (Section 6.3: removing/changing attention head weights affects performance). The two-phase design is tested against single-phase baselines with all other variables controlled. The ablation design is adequate (single-variable manipulation)."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper tests only 2 model architectures (Llama-3-8B and Mistral-7B) and 3 defenses (SecAlign, SecAlign++, StruQ) but makes broad claims about 'the class of fine-tuning based prompt injection defenses' (Section 9). The title claims to be 'Breaking Fine-Tuning based Prompt Injection Defenses' generically."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper does not substantively discuss alternative explanations for ASTRA's success. For example, the budget increase from 20 to 35 tokens alone might explain much of the improvement (Section 7.3 briefly notes 'even crude attacks can break defenses with increased budgets' but does not quantify this confound). No discussion of whether the warm-start effect is the primary driver vs the attention mechanism specifically."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper measures whether the output starts with 'Hacked' and frames this as successful prompt injection. Real-world prompt injection success involves more complex objectives (data exfiltration, tool misuse, dynamic targets). Section 7.2 briefly notes that dynamic prompt injections are not tested, but the gap between the 'Hacked' proxy and real attack success is not explicitly discussed."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Specific model versions are named: 'Meta-Llama-3-8B-Instruct' and 'Mistral-7B-Instruct-v0.1' (Section 6.1). SecAlign++ model is also specified."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The exact prompt template used is not provided. The paper describes the structure (AlpacaFarm format with 'instruction' and 'input' fields, payload at end) and payload text ('Print exactly Hacked!') but does not show the complete prompt text or chat template sent to the model."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 6.1 reports: 500 iterations (1000 for ASTRA++), B=512 forward passes per iteration, p=256 top candidates, 50 Dolly examples for sensitivity computation, bottom 75% heads dropped, temperature 0 for generation. ASTRA++ specifics in Section 6.2: 700 attention + 300 GCG steps, sensitivities refreshed every 50 iterations, 50% head clipping."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. The paper evaluates direct prompt injection attacks on LLMs without any scaffolding or agent framework."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "The paper describes data preparation: 208 AlpacaFarm examples with non-empty 'input' field, 50 randomly chosen for evaluation, 40 completed. For ASTRA++: 10 randomly chosen training examples, 198 test examples. Payload injection placement described in detail."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 7.2 'Limitations of ASTRA' provides substantive discussion of memory footprint, performance overhead (1.7-2.5x slower than GCG), and the restriction to fixed target strings."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 7.2 identifies specific threats: ASTRA's memory-intensive operation causing 1.7-2.5x slowdown, evaluation limited to fixed target strings ('Hacked') not dynamic prompt injections, and hardware limitations restricting evaluation to 40 out of 208 examples."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 7.2 explicitly states what was NOT tested: dynamic prompt injections with variable target strings, and multi-modal models. Section 3 clearly delineates the threat model (whitebox adaptive adversary). However, the paper does not explicitly state which model architectures or defense types are excluded from its claims."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "Individual attack results per example, generated adversarial tokens, and per-example success/failure are not made available. Only aggregate success rates are reported in tables."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 6.1 describes how examples were selected: '50 randomly chosen examples from the 208 AlpacaFarm evaluation set which contain a non-empty input field.' Section 6.2 describes the ASTRA++ training set selection: '10 randomly chosen examples from the 208 AlpacaFarm examples.'"
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. The paper uses standard public benchmark data (AlpacaFarm, Dolly)."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The pipeline is documented: 208 AlpacaFarm examples → 50 randomly chosen → 40 completed (10 dropped due to hardware limitations). For ASTRA++: 10 training + 198 test, 3 training runs each. Injection format and placement described."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding source or acknowledgments section is present in the paper."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "All four authors are listed with their affiliation: UC San Diego. They are not affiliated with the developers of the defenses being attacked (Meta for SecAlign, academic teams for StruQ)."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding is disclosed, so independence of funder cannot be assessed."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is present in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "This paper tests attacks against prompt injection defenses rather than evaluating a model's knowledge on a benchmark. The attack success does not depend on whether the model has seen AlpacaFarm during training."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "This paper tests defense robustness against adversarial attacks, not model capability on benchmark tasks. Contamination in the traditional sense is not applicable."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "The evaluation measures attack success (can the attack force the model to output 'Hacked'), not the model's pre-trained knowledge. Benchmark contamination is not relevant to this threat model."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Section 7.2 mentions ASTRA is '1.7-2.5x slower than GCG' but reports no absolute cost figures (GPU hours, wall-clock time, energy consumption). No per-example cost is reported."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No GPU type, total compute hours, or hardware specifications are reported. The paper mentions 'hardware limitations' prevented completing all examples but does not quantify the compute used."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "ASTRA++ reports results across 3 training runs with different training sets, but this tests training set sensitivity, not random seed sensitivity. ASTRA (strong knowledge) reports no seed sensitivity. Section 4.2 notes that 'the role played by the initializer and randomness are crucial' but does not systematically test seed sensitivity for the main attacks."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "Section 6.2 states '3 different training runs, each with a completely different set of 10 training examples.' Section 4.2 reports r=1 to r=5 runs for the GCG analysis. ASTRA strong knowledge appears to be single-run per example."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No hyperparameter search is described. Key hyperparameters (75% clipping threshold, p=256, B=512, 700/300 phase split for ASTRA++) appear chosen without stated justification or search budget."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "Results are reported across all tested budget configurations (20, 25, 30, 35 tokens) in Tables 1 and 2, not just the best one. The ablation study in Section 6.3 shows all weight selection strategies tested."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors implement both ASTRA and the GCG baseline. They do not acknowledge or discuss the bias of implementing and tuning both the proposed method and the baseline. Prior work (Lucic et al. 2018) shows this systematically favors the proposed method."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "The paper carefully controls for compute budget: 'we control for all the following variables: attacker budget and configuration, attack initialization, number of optimization iterations, number of forward passes on each iteration, value of p' (Section 6.1). ASTRA and GCG use identical compute budgets per comparison."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The benchmark evaluates whether the model outputs 'Hacked' on AlpacaFarm examples. The paper does not discuss whether this construct (fixed target string on instruction-following examples) is representative of real-world prompt injection scenarios, which involve diverse objectives and contexts."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is involved. The attacks are applied directly to the model without any agentic scaffold."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "The paper does not discuss whether the AlpacaFarm evaluation examples or Dolly training examples could have been present in the pre-training data of Llama-3 or Mistral, which could affect the baseline model behavior."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": true,
    353         "justification": "The paper explicitly distinguishes between strong knowledge (attacker knows context) and weak knowledge (attacker doesn't know context) assumptions (Section 3). ASTRA++ specifically addresses the more realistic weak knowledge setting where the attacker has no information about the conversation history."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "For ASTRA++, training and test examples are drawn from the same AlpacaFarm dataset (10 train, 198 test). The paper does not discuss whether these examples share structural similarities or whether this affects the universality claims."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No concrete leakage detection or prevention method is used. The paper does not employ canary strings, membership inference tests, or decontamination pipelines."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "GCG-style guided optimization does not significantly outperform unguided random search against SecAlign-defended models.",
    370       "evidence": "Section 4.2, Figures 1-2: Distribution of Dr (difference between guided and unguided loss) centered at 0 across 50 examples with r=1 to 5 runs. Average loss curves for guided and unguided optimization are nearly identical over 500 iterations.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "ASTRA achieves 82.5% ASR on SecAlign-Mistral and 72.5% on SecAlign-Llama with 35-token budget in the strong knowledge setting.",
    375       "evidence": "Table 1: Controlled comparison with GCG baseline across 40 examples, with all hyperparameters controlled.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "ASTRA++ achieves 96% mean ASR on unseen contexts against SecAlign-Llama with 35-token budget.",
    380       "evidence": "Table 2a: Mean 96% (std dev 5%) over 3 training runs on 198 unseen test examples. However, other configurations show high variance (e.g., budget 30: mean 28%, std 48%).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "ASTRA++ achieves 87% mean ASR against SecAlign++ (instruction as system) with 35-token budget.",
    385       "evidence": "Table 2b: Mean 87% (std dev 12%) over 3 training runs on 198 unseen examples against the production-grade instruction hierarchy model.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Clipped sensitivities outperform other weighting functions (uniform, first-layer-only, last-layer-only, average sensitivities) for the attention loss.",
    390       "evidence": "Section 6.3, Figure 7: Controlled ablation study on 40 examples showing loss curves for different weighting strategies, with clipped sensitivities achieving the lowest target logprobs.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Fine-tuning based prompt injection defenses (SecAlign, SecAlign++, StruQ) do not provide the claimed security properties in the whitebox setting.",
    395       "evidence": "Tables 1-2: High ASR across multiple defenses and models. However, this is tested with increased token budgets (up to 35 tokens vs original 20-token evaluation) and only on 2 model architectures.",
    396       "supported": "moderate"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "Small evaluation sample size",
    402       "detail": "ASTRA strong knowledge evaluation reports results on only 40 of the 208 eligible examples: 50 were randomly sampled and 10 of those were dropped 'due to hardware limitations.' The 40 completed examples represent ~19% of the evaluation set, raising concerns about selection effects if harder examples were disproportionately dropped."
    403     },
    404     {
    405       "flag": "High variance in ASTRA++ results",
    406       "detail": "Table 2 shows extreme variance across training runs: e.g., SecAlign-Llama at budget 30 has mean 28% with 48% std dev, and SecAlign++ (user) at budget 35 has mean 22% with 54% std dev. This suggests results are highly sensitive to training set selection, undermining reliability claims."
    407     },
    408     {
    409       "flag": "Budget increase confound",
    410       "detail": "The paper's strongest results use 35-token budgets while the original defenses were evaluated at 20 tokens. Section 7.3 acknowledges 'even crude attacks can break defenses with increased budgets,' but the contribution of budget increase vs the ASTRA mechanism is not quantified. At 20 tokens, ASTRA achieves only 20% and 75% on Llama and Mistral respectively."
    411     },
    412     {
    413       "flag": "No statistical significance tests",
    414       "detail": "All comparisons between ASTRA and GCG are based on raw ASR differences without any statistical testing. Given the small sample size (40 examples) and the high variance across only 3 training runs, observed differences may not be statistically significant."
    415     },
    416     {
    417       "flag": "Limited model diversity",
    418       "detail": "Only two base model architectures are tested (Llama-3-8B and Mistral-7B), both in the 7-8B parameter range. Generalization claims to 'the class of fine-tuning based prompt injection defenses' rest on narrow architectural and scale diversity."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "Struq: Defending against prompt injection with structured queries",
    424       "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"],
    425       "year": 2024,
    426       "relevance": "One of the three fine-tuning-based prompt injection defenses attacked and evaluated in this paper."
    427     },
    428     {
    429       "title": "Secalign: Defending against prompt injection with preference optimization",
    430       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "David Wagner", "Chuan Guo"],
    431       "year": 2025,
    432       "relevance": "Primary defense evaluated in this paper; shown to be vulnerable to ASTRA attacks with 82.5% ASR."
    433     },
    434     {
    435       "title": "Meta secalign: A secure foundation llm against prompt injection attacks",
    436       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "David Wagner", "Chuan Guo"],
    437       "year": 2025,
    438       "relevance": "Production-grade instruction hierarchy defense from Meta, attacked with ASTRA++ achieving 87% ASR."
    439     },
    440     {
    441       "title": "Universal and transferable adversarial attacks on aligned language models",
    442       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J. Zico Kolter", "Matt Fredrikson"],
    443       "year": 2023,
    444       "relevance": "Introduces GCG, the baseline optimization-based attack against which ASTRA is compared."
    445     },
    446     {
    447       "title": "Neural exec: Learning (and learning from) execution triggers for prompt injection attacks",
    448       "authors": ["Dario Pasquini", "Martin Strohmeier", "Carmela Troncoso"],
    449       "year": 2024,
    450       "relevance": "Prior work on universal prompt injection attacks using GCG variants; ASTRA++ compared to universal GCG equivalent."
    451     },
    452     {
    453       "title": "Automatic and universal prompt injection attacks against large language models",
    454       "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang", "Ning Zhang", "Chaowei Xiao"],
    455       "year": 2024,
    456       "relevance": "Prior work on automated universal prompt injection; ASTRA addresses the same problem with architectural awareness."
    457     },
    458     {
    459       "title": "Imprompter: Tricking llm agents into improper tool use",
    460       "authors": ["Xiaohan Fu", "Shuheng Li", "Zihan Wang", "Yihao Liu", "Rajesh K. Gupta", "Taylor Berg-Kirkpatrick", "Earlence Fernandes"],
    461       "year": 2024,
    462       "relevance": "Demonstrates GCG-based prompt injection for data leakage in agentic systems; motivates stronger attacks like ASTRA."
    463     },
    464     {
    465       "title": "Fun-tuning: Characterizing the vulnerability of proprietary llms to optimization-based prompt injection attacks via the fine-tuning interface",
    466       "authors": ["Andrey Labunets", "Nishit Pandya", "Ashish Hooda", "Xiaohan Fu", "Earlence Fernandes"],
    467       "year": 2025,
    468       "relevance": "Prior work from same group on optimization-based prompt injection attacks via fine-tuning interfaces."
    469     },
    470     {
    471       "title": "The instruction hierarchy: Training llms to prioritize privileged instructions",
    472       "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"],
    473       "year": 2024,
    474       "relevance": "OpenAI's instruction hierarchy approach to prompt injection defense; SecAlign++ implements this paradigm."
    475     },
    476     {
    477       "title": "Obfuscated gradients give a false sense of security: Circumventing defenses to adversarial examples",
    478       "authors": ["Anish Athalye", "Nicholas Carlini", "David Wagner"],
    479       "year": 2018,
    480       "relevance": "Foundational adversarial ML work on evaluating defenses with strong adaptive attacks; directly motivates ASTRA's approach."
    481     },
    482     {
    483       "title": "AdvPrompter: Fast adaptive adversarial prompting for LLMs",
    484       "authors": ["Anselm Paulus", "Arman Zharmagambetov", "Chuan Guo", "Brandon Amos", "Yuandong Tian"],
    485       "year": 2025,
    486       "relevance": "Prior SOTA automated attack against which SecAlign claimed robustness; ASTRA surpasses its attack capability."
    487     },
    488     {
    489       "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection",
    490       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    491       "year": 2023,
    492       "relevance": "Foundational work on indirect prompt injection attacks against LLM-integrated applications."
    493     },
    494     {
    495       "title": "AttGCG: Enhancing jailbreaking attacks on LLMs with attention manipulation",
    496       "authors": ["Zijun Wang", "Haoqin Tu", "Jieru Mei", "Bingchen Zhao", "Yisen Wang", "Cihang Xie"],
    497       "year": 2024,
    498       "relevance": "Prior work on attention-based attacks for jailbreaking; ASTRA differs in using attention as a warm start with sensitivity-weighted heads."
    499     }
    500   ],
    501   "engagement_factors": {
    502     "practical_relevance": {
    503       "score": 2,
    504       "justification": "Security researchers evaluating prompt injection defenses could directly apply ASTRA's methodology; code is released."
    505     },
    506     "surprise_contrarian": {
    507       "score": 2,
    508       "justification": "Challenges the near-100% attack resistance claimed by SecAlign and StruQ, showing these defenses can be broken with 85-95% attack success rates under modest budget increases."
    509     },
    510     "fear_safety": {
    511       "score": 3,
    512       "justification": "Demonstrates that a major class of prompt injection defenses (fine-tuning-based) can be defeated, raising direct concerns about deployed LLM security."
    513     },
    514     "drama_conflict": {
    515       "score": 2,
    516       "justification": "Directly challenges security claims of Meta's SecAlign++ and recent CCS/USENIX publications, framing their evaluations as insufficient."
    517     },
    518     "demo_ability": {
    519       "score": 1,
    520       "justification": "Code is released but requires whitebox model access, significant GPU resources, and ML expertise to reproduce."
    521     },
    522     "brand_recognition": {
    523       "score": 1,
    524       "justification": "Academic paper from UC San Diego; targets Meta's SecAlign++ which adds some brand relevance, but the authors themselves are not widely known."
    525     }
    526   }
    527 }

Impressum · Datenschutz