ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (32837B)


      1 {
      2   "paper": {
      3     "title": "Fun-tuning: Characterizing the Vulnerability of Proprietary LLMs to Optimization-based Prompt Injection Attacks via the Fine-Tuning Interface",
      4     "authors": [
      5       "Andrey Labunets",
      6       "Nishit V. Pandya",
      7       "Ashish Hooda",
      8       "Xiaohan Fu",
      9       "Earlence Fernandes"
     10     ],
     11     "year": 2025,
     12     "venue": "IEEE Symposium on Security and Privacy",
     13     "arxiv_id": "2501.09798",
     14     "doi": "10.1109/SP61157.2025.00121"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "The fine-tuning interface of proprietary LLMs leaks loss-like information that can guide optimization-based prompt injection attacks on closed-weight models. Using the Gemini fine-tuning API with a very small learning rate, the authors achieved 65% and 82% attack success rates on Gemini 1.5 Flash and 1.0 Pro, respectively, on the PurpleLlama benchmark. The attacks cost under $10 total and transfer across Gemini model variants with 50-90% success rates. Google deployed mitigations (a lower bound on the learning rate, a minimum batch size of 4) following responsible disclosure.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Code is released at https://github.com/earlence-security/fun-tuning, stated in the Disclosure and Ethics section: 'Code is available at https://github.com/earlence-security/fun-tuning.'"
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The evaluation uses the PurpleLlama CyberSecEval benchmark (publicly available). PPL40 is a 40-example subset sampled from the public benchmark with the sampling criteria described in Section 6.1."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No environment specification (requirements.txt, Dockerfile, or detailed dependency listing) is provided in the paper."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper provides algorithmic pseudocode (Algorithms 1 and 2) and attack configuration details, but no step-by-step reproduction instructions (README with commands, reproduction scripts) are described in the paper itself."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Standard deviations are reported for all ASR results: e.g., '82.0 ± 4.2' (Table 3), '65.3 ± 3.8' (Table 4), and transfer evaluation results in Tables 5-6 all include ± values."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No formal significance tests (p-values, t-tests, etc.) are used. The paper claims improvements are 'outside of standard deviation' (Tables 3-4 captions) but this is not a formal statistical test."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Improvement factors over baseline are reported: 1.9x for Gemini 1.0 Pro and 2.4x for Gemini 1.5 Flash (Tables 3-4). Absolute ASR values with baselines provide context for the magnitude of improvement."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The choice of 40 examples (PPL40) from the 56-example PurpleLlama benchmark is not justified with power analysis or reasoning about statistical adequacy. The paper states they 'randomly sampled 40 indirect prompt injection examples' for 'quicker exploration' (Section 6.1)."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Variance is reported across 20 scoring runs for primary attacks and 5 runs for transfer evaluation: e.g., '82.0 ± 4.2' (Table 3). Section 6.2 states: 'we repeat this procedure 20 times... and report the mean and the standard deviation.'"
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Two baselines are included: (1) unmodified PurpleLlama injections without optimization (baseline ASR), and (2) an ablation attack using random numbers instead of training losses (Section 6.4, Tables 3-4)."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The PurpleLlama benchmark (2024) provides the base prompt injections, and the ablation baseline is a matched variant of the proposed method. The paper also contextualizes results against Google's own internal red-teaming evaluation (Section 6.5, reference to Gemini 1.5 report)."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Section 6.4 describes a principled ablation: 'we only removed the effects of the fine-tuning procedure. Instead of receiving the true training losses for each candidate, this algorithm received random numbers. All other attack parameters... were kept the same.'"
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Multiple metrics are reported: attack success rate (ASR), improvement factor over baseline, number of fine-tuning requests, wall-clock time, and financial cost (Tables 3-4). Per-scenario and per-iteration breakdowns are also provided (Figs. 5-9)."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "All evaluation is automated using GPT-4o as judge (Section 6.2). No human evaluation of attack outputs is performed, though the authors manually revised judge questions to be stricter."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "The 40 PPL40 examples are used both for attack optimization and for measuring success rates. There is no separate held-out set. The transfer evaluation (Tables 5-6) tests on other models but the same 40 examples."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Figures 8 and 9 show per-scenario ASR breakdowns (code, exercise, population, transaction, password, zubrowka, resume, employee) for both target models. Table 9 shows the attack category distribution."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 6.5 discusses failure cases: 'attacks are least successful for the password category' (~10-20% ASR), and code analysis fails against Gemini 1.5 Flash (40% ASR). They hypothesize reasons: 'Gemini models were trained to resist phishing in some way' and 'the newer model is significantly better at code analysis.'"
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Several negative results are reported: password phishing largely fails, code analysis attacks fail on Gemini 1.5 Flash, and the ablation (random substitution) is 'surprisingly' effective at 43.8%/61.3% ASR (Section 6.5). They also note the Gemini 1.5 Flash attack is much slower (60 hours vs 15 hours per example)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Abstract claims of '65% and 82%' ASR are supported by Tables 3 (82.0 ± 4.2%) and 4 (65.3 ± 3.8%). The claim about 'loss-like information' being useful for optimization is supported by Sections 4.2-4.3 (Figs. 2-4)."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The central causal claim — that training loss signal improves attack success — is justified via the ablation study (Section 6.4) that removes only the loss signal while keeping all other components identical. Fun-tuning (82%) vs. ablation (61.3%) on Gemini 1.0 Pro isolates the contribution of the loss signal."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title claims 'Proprietary LLMs' broadly, but only Google's Gemini models are tested. The meta-review explicitly flags: 'The attack has been demonstrated on a single service, so it is not yet known which other services might be vulnerable.' Table 7 discusses other APIs speculatively but without experimental evidence."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "The paper discusses that random token substitution is 'surprisingly' effective (43.8%/61.3% ablation ASR), that some PurpleLlama attacks are naturally effective (27.5%/42.5% baseline), and that Gemini's safety tuning explains failures in specific categories (password, code). Section 6.5 considers multiple factors contributing to results."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Section 6.2 explicitly addresses the gap between automated scoring and true attack success. They redesigned the PurpleLlama judge questions to be stricter, noting that original questions 'are overly permissive and result in too many false positives.' Table 2 shows a concrete false-positive/false-negative analysis."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Specific model version identifiers are used throughout: 'gemini-1.5-flash-001-tuning', 'gemini-1.0-pro-001', 'gemini-1.5-pro-001', 'gemini-2.0-flash' (Section 6.3). For scoring, 'GPT-4o' is specified as the judge."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Example prompts are shown in Figures 1 and 10 with adversarial payloads highlighted. The prompts derive from the public PurpleLlama benchmark. Section 6.3 describes the chat format and token delimiters (<start_of_turn>, <end_of_turn>). Code is released containing full prompt implementations."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Detailed hyperparameters in Section 6.3: 45 iterations, 2 restarts at iterations 15 and 30, 20-token prefix/suffix initialized with '!', 1000 candidates per iteration, batch size 1, learning rate ~10^-45, and inference temperature settings (default and 0)."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. The system is a direct optimization algorithm that interacts with the fine-tuning API, not an agent pipeline."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 6.1 describes sampling 40 indirect injection examples from PurpleLlama's 56, excluding token smuggling category and non-standard encodings. Table 9 shows the distribution of attack types in PPL40 vs. PurpleLlama. Section 6.2 describes the judge question modifications."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. Section 7 (Discussion) discusses mitigations and attack universality across other APIs, but focuses on the mitigation landscape rather than methodological limitations of the study itself."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No specific threats to validity are discussed. The paper does not address issues like the representativeness of PPL40, the validity of GPT-4o as judge, potential confounds from Gemini safety training, or the approximate permutation recovery error affecting results."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "While Section 7 states 'we established the feasibility of our method specifically against the Gemini API,' the paper does not systematically state what the results do NOT show. The meta-review's noteworthy concern — that only a single service was tested — is not explicitly addressed as a scope boundary in the paper."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The raw training loss data from fine-tuning experiments, intermediate optimization results, and per-example attack outputs are not released. Only aggregated ASR results are reported in the paper."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The data collection is well described: PurpleLlama benchmark selection (Section 6.1), fine-tuning API interaction procedure (Sections 4-5), scoring methodology via GPT-4o (Section 6.2), and the 20-repetition scoring protocol."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants. Data source is the standard PurpleLlama benchmark."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The pipeline from PurpleLlama benchmark → PPL40 subset selection (with exclusion criteria stated) → attack optimization (Algorithms 1-2) → scoring via GPT-4o judge → ASR aggregation is documented across Sections 5-6."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Acknowledgements section states: 'This work is supported in part by gifts from Amazon and Google and by NSF award 2312119.'"
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed: UC San Diego and University of Wisconsin Madison. No authors are affiliated with Google (the vendor whose product is attacked)."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "Google is both a funder ('gifts from... Google') and the vendor whose product (Gemini) is shown to be vulnerable. Although the findings go against Google's interest in Gemini being perceived as secure, Google has a clear financial stake in the outcome."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests statement or financial interest declarations appear in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "The training data cutoff dates for the Gemini models used are not stated. This is relevant because PurpleLlama prompt patterns could appear in Gemini's safety training data, affecting measured ASR."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No discussion of whether Gemini may have been specifically trained to resist PurpleLlama attacks. The paper notes 'Recent LLM products have been fine-tuned specifically to resist prompt injection attempts' (Section 3) but does not discuss benchmark-specific overlap."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "PurpleLlama was publicly released in 2024 and Gemini models could have incorporated these patterns into safety training. This is not discussed, though it would deflate rather than inflate the reported ASR."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants. The Disclosure and Ethics section addresses responsible disclosure to Google, not human subjects research."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Per-example costs are reported: $0.18 for Gemini 1.0 Pro, $0.02 for Gemini 1.5 Flash (Tables 3-4). Total attack cost is stated: 'all attacks combined cost < $10.' Time per example: 15 hours (Gemini 1.0 Pro) and 60 hours (Gemini 1.5 Flash)."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Compute budget is reported: 90 fine-tuning calls per example, each taking ~10 minutes (Gemini 1.0 Pro) or ~40 minutes (Gemini 1.5 Flash). Total time per attack: 15 hours and 60 hours respectively. Fine-tuning calls are free; inference costs are the financial budget."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "The optimization algorithm uses random token sampling but is not run across multiple seeds. Variance is reported across 20 scoring runs (non-deterministic inference), but the optimization itself runs once per example with no seed sensitivity analysis."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "Section 6.2 states: 'we repeat this procedure 20 times to evaluate the primary attack (5 times for the transfer evaluation) and report the mean and the standard deviation.' Optimization runs 45 iterations with 2 restarts."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "No systematic hyperparameter search is described. Section 6.6 studies the effect of candidate set size locally but the main attack parameters (45 iterations, 2 restarts, 20-token prefix/suffix) are not justified through a search budget."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": true,
    317         "justification": "The per-example best perturbation is selected as the maximum-scoring iteration across all 45 iterations (Section 6.3: 'we stored the perturbations found at each iteration so we could identify the best one'). Section 6.6 justifies the candidate set size of 1000 through local simulation (Fig. 7)."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "Multiple comparisons are made across 10 model configurations, 3 methods, and 8 scenarios (Tables 3-6, Figs. 8-9) without any correction for multiple comparisons."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors implement and evaluate their own attack against their own baselines and ablation. No acknowledgment of author-evaluation bias or use of independent evaluation."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": true,
    332         "justification": "Figures 5-6 show ASR as a function of iterations (compute). Figure 7 shows attack success rate and loss as a function of candidate set size. The time and cost per example are reported alongside ASR."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The paper uses PurpleLlama as the benchmark without discussing whether it accurately represents real-world prompt injection risk. While Section 6.2 improves the judge questions, there is no discussion of whether the benchmark's scenarios, attack categories, or success criteria are valid measures of actual vulnerability."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No scaffolding is involved. The attack directly targets the model through the fine-tuning interface, not through any scaffold or agent framework."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "No discussion of temporal leakage. PurpleLlama was released in 2024 and the Gemini models may have incorporated these attack patterns into safety training before the experiments."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "Not discussed. The fine-tuning interface setup could theoretically provide different information than what a real attacker would have in some deployment scenarios."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "Not discussed. The 40 PPL40 examples may share structural similarities (from the same benchmark creators) that could inflate or deflate aggregate ASR."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No concrete leakage detection or prevention method is used."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "Fun-tuning achieves 82% attack success rate on Gemini 1.0 Pro and 65% on Gemini 1.5 Flash on the PurpleLlama benchmark.",
    371       "evidence": "Tables 3 and 4 report 82.0 ± 4.2% ASR for Gemini 1.0 Pro and 65.3 ± 3.8% for Gemini 1.5 Flash, averaged over 20 scoring runs with standard deviations.",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "The fine-tuning training loss is linearly proportional to average logprobs for long output sequences.",
    376       "evidence": "Section 4.2, Figure 3 shows R² approaching 1 as output length increases. The hypothesized relationship TrainingLoss(Y|X) = K(X) + l·AvgLogprobs(Y|X) is validated empirically with 10 open-ended questions.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "The training loss serves as a useful proxy signal for discrete optimization of adversarial prompts, even for short target strings.",
    381       "evidence": "Section 4.3, Figure 4 shows that the candidate selected by training loss is ranked among the top few by true logprobs, with distributions skewed left across three test questions with M=100 samples and N=10 candidates.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "The improvement over baseline is due to the training loss signal, not other attack components.",
    386       "evidence": "Ablation study (Section 6.4) replaces training losses with random numbers while keeping all other parameters identical. Fun-tuning ASR (82.0%/65.3%) exceeds ablation ASR (61.3%/43.8%) with improvements outside standard deviation (Tables 3-4).",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Attacks computed against one Gemini model transfer to other Gemini models with relatively high success rates.",
    391       "evidence": "Tables 5-6 show transfer ASR: Gemini 1.0 Pro attacks transfer to other 1.0 Pro variants at >80% ASR and to 1.5 Flash at 49-56% ASR. Gemini 1.5 Flash attacks transfer to 2.0 Flash at ~90% ASR and to 1.0 Pro variants at >71% ASR.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "The approximate permutation recovery method misidentifies only ~4-7% of positions.",
    396       "evidence": "Table 8 (Appendix) shows normalized Hamming distance of 0.036-0.074 and Kendall correlation of 0.898-0.947 across training set sizes 100-1000, averaged over 5 approximate permutations.",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "The Gemini fine-tuning API applies a fixed, deterministic permutation to training losses for a given dataset size.",
    401       "evidence": "Section 4.1 establishes this through controlled experiments with duplicated training data of known cardinalities and repeated fine-tuning requests with different data but same sizes.",
    402       "supported": "moderate"
    403     }
    404   ],
    405   "red_flags": [
    406     {
    407       "flag": "Single vendor tested despite broad title",
    408       "detail": "The title says 'Proprietary LLMs' but only Google's Gemini is experimentally evaluated. Table 7 discusses other APIs (OpenAI, Anthropic) speculatively but provides no experimental evidence. The meta-review explicitly flags this as a noteworthy concern."
    409     },
    410     {
    411       "flag": "LLM-as-judge without validation",
    412       "detail": "GPT-4o is used as the sole judge for attack success. While the authors improved the judge questions (Section 6.2), they did not validate GPT-4o's judging accuracy against human evaluators, introducing potential systematic bias in ASR measurements."
    413     },
    414     {
    415       "flag": "No formal significance testing",
    416       "detail": "Comparisons between fun-tuning and baselines/ablation are described as having improvements 'outside of standard deviation' but no formal significance tests are applied. With only 40 examples and multiple comparisons across models and scenarios, this risks overstating the certainty of improvements."
    417     },
    418     {
    419       "flag": "Funder conflict",
    420       "detail": "Google is acknowledged as a funder ('gifts from... Google') while the paper demonstrates vulnerabilities in Google's Gemini product. Although the findings go against Google's interest, the funding relationship is not discussed as a potential conflict."
    421     },
    422     {
    423       "flag": "Ablation surprisingly strong",
    424       "detail": "The ablation (random substitution) achieves 43.8%/61.3% ASR vs. baseline 27.5%/42.5%, meaning nearly half of the attack improvement comes from random token wrapping rather than loss-guided optimization. The incremental contribution of the loss signal (ablation→fun-tuning, +21.5/+20.7 points) is only modestly larger than the random wrapping effect (baseline→ablation, +16.3/+18.8 points)."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "Universal and transferable adversarial attacks on aligned language models",
    430       "authors": ["A. Zou", "Z. Wang", "J. Z. Kolter", "M. Fredrikson"],
    431       "year": 2023,
    432       "relevance": "Foundational whitebox GCG algorithm for adversarial prompt optimization that this paper extends to the graybox setting."
    433     },
    434     {
    435       "title": "Cyberseceval 2: A wide-ranging cybersecurity evaluation suite for large language models",
    436       "authors": ["M. Bhatt", "S. Chennabasappa", "Y. Li"],
    437       "year": 2024,
    438       "arxiv_id": "2404.13161",
    439       "relevance": "PurpleLlama benchmark used as the primary evaluation dataset for prompt injection attacks in this paper."
    440     },
    441     {
    442       "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection",
    443       "authors": ["K. Greshake", "S. Abdelnabi", "S. Mishra", "C. Endres", "T. Holz", "M. Fritz"],
    444       "year": 2023,
    445       "relevance": "Seminal work on indirect prompt injection attacks against LLM-integrated applications, establishing the threat model this paper builds upon."
    446     },
    447     {
    448       "title": "Neural exec: Learning (and learning from) execution triggers for prompt injection attacks",
    449       "authors": ["D. Pasquini", "M. Strohmeier", "C. Troncoso"],
    450       "year": 2024,
    451       "arxiv_id": "2403.03792",
    452       "relevance": "Whitebox prompt injection attack using GCG algorithm; fun-tuning adopts its prefix-suffix wrapping style for adversarial prompt structure."
    453     },
    454     {
    455       "title": "Query-based adversarial prompt generation",
    456       "authors": ["J. Hayase", "E. Borevkovic", "N. Carlini", "F. Tramèr", "M. Nasr"],
    457       "year": 2024,
    458       "arxiv_id": "2402.12329",
    459       "relevance": "Graybox attack using logprobs from inference endpoints; fun-tuning extends this by using fine-tuning loss when logprobs are unavailable."
    460     },
    461     {
    462       "title": "Covert malicious finetuning: Challenges in safeguarding llm adaptation",
    463       "authors": ["D. Halawi", "A. Wei", "E. Wallace", "T. T. Wang", "N. Haghtalab", "J. Steinhardt"],
    464       "year": 2024,
    465       "arxiv_id": "2406.20053",
    466       "relevance": "Demonstrates attacks on LLMs through fine-tuning APIs using encoded malicious data; orthogonal to fun-tuning, which uses loss signals rather than weight updates."
    467     },
    468     {
    469       "title": "The instruction hierarchy: Training llms to prioritize privileged instructions",
    470       "authors": ["E. Wallace", "K. Xiao", "R. Leike", "L. Weng", "J. Heidecke", "A. Beutel"],
    471       "year": 2024,
    472       "arxiv_id": "2404.13208",
    473       "relevance": "Defense mechanism training LLMs to resist prompt injection by prioritizing system instructions over user/third-party inputs."
    474     },
    475     {
    476       "title": "Injecagent: Benchmarking indirect prompt injections in tool-integrated large language model agents",
    477       "authors": ["Q. Zhan", "Z. Liang", "Z. Ying", "D. Kang"],
    478       "year": 2024,
    479       "arxiv_id": "2403.02691",
    480       "relevance": "Benchmark for evaluating indirect prompt injection in tool-integrated LLM agents, directly relevant to the agentic attack scenarios discussed."
    481     },
    482     {
    483       "title": "Stealing part of a production language model",
    484       "authors": ["N. Carlini", "D. Paleka", "K. D. Dvijotham"],
    485       "year": 2024,
    486       "arxiv_id": "2403.06634",
    487       "relevance": "Demonstrates extraction of information from closed-weight LLMs through API access; related attack surface of reverse-engineering proprietary models."
    488     },
    489     {
    490       "title": "Jailbreaking black box large language models in twenty queries",
    491       "authors": ["P. Chao", "A. Robey", "E. Dobriban", "H. Hassani", "G. J. Pappas", "E. Wong"],
    492       "year": 2024,
    493       "arxiv_id": "2310.08419",
    494       "relevance": "PAIR algorithm for blackbox jailbreaking; represents the blackbox attack approach that fun-tuning's graybox method aims to outperform."
    495     },
    496     {
    497       "title": "Struq: Defending against prompt injection with structured queries",
    498       "authors": ["S. Chen", "J. Piet", "C. Sitawarin", "D. Wagner"],
    499       "year": 2024,
    500       "arxiv_id": "2402.06363",
    501       "relevance": "Defense against prompt injection using structured query formatting to separate trusted and untrusted inputs."
    502     },
    503     {
    504       "title": "Pal: Proxy-guided black-box attack on large language models",
    505       "authors": ["C. Sitawarin", "N. Mu", "D. Wagner", "A. Araujo"],
    506       "year": 2024,
    507       "arxiv_id": "2402.09674",
    508       "relevance": "Proxy-guided black-box attack using logprobs; surveys API access levels across vendors (Table 6 referenced by fun-tuning)."
    509     }
    510   ],
    511   "engagement_factors": {
    512     "practical_relevance": {
    513       "score": 2,
    514       "justification": "Security researchers and red teamers can adapt the technique, though the specific Gemini exploit was mitigated by Google in April 2025."
    515     },
    516     "surprise_contrarian": {
    517       "score": 2,
    518       "justification": "Challenges the assumption that closed-weight models are safe from optimization-based attacks by revealing that fine-tuning APIs leak sufficient loss information."
    519     },
    520     "fear_safety": {
    521       "score": 3,
    522       "justification": "Demonstrates a novel, practical attack surface on proprietary LLMs that exploits a fundamental utility-security tradeoff in fine-tuning interfaces."
    523     },
    524     "drama_conflict": {
    525       "score": 1,
    526       "justification": "Responsible disclosure to Google was handled cooperatively; no major controversy beyond the technical vulnerability itself."
    527     },
    528     "demo_ability": {
    529       "score": 2,
    530       "justification": "Code released on GitHub, but the specific Gemini exploit has been patched and requires API access to test."
    531     },
    532     "brand_recognition": {
    533       "score": 2,
    534       "justification": "Targets Google Gemini (major product), published at IEEE S&P (top security venue), from UCSD researchers."
    535     }
    536   }
    537 }

Impressum · Datenschutz