scan-v5.json (27131B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Fun-tuning: Characterizing the Vulnerability of Proprietary LLMs to Optimization-Based Prompt Injection Attacks via the Fine-Tuning Interface", 6 "authors": [ 7 "Andrey Labunets", 8 "Nishit V. Pandya", 9 "Ashish Hooda", 10 "Xiaohan Fu", 11 "Earlence Fernandes" 12 ], 13 "year": 2025, 14 "venue": "IEEE Symposium on Security and Privacy", 15 "arxiv_id": "2501.09798", 16 "doi": "10.1109/SP61157.2025.00121" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "All abstract claims are backed by experimental results: loss-logprob proxy validated (Fig. 3, R²→1), attack success rates of 65.3% and 82.0% confirmed in Tables 3–4, and the utility-security tradeoff is documented via the Google mitigation disclosure.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "The ablation study (Section 6.4) isolates the fine-tuning loss signal by replacing it with random numbers while keeping all other parameters identical, providing adequate evidence that loss signal causes the improvement over random substitution.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": true, 35 "justification": "The paper explicitly frames results as proof-of-concept on Gemini's API and acknowledges in the meta-review response that 'it is not yet known which other services might be vulnerable'; Table 7 discusses other APIs without formal testing.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": true, 41 "justification": "The paper explicitly notes that the ablation ASR (43.8% and 61.3%) is surprisingly high above baseline, stating 'random token substitution strategy might also be effective against Gemini models' and attributing this partly to Gemini's inherent 
susceptibility.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper carefully distinguishes between the proxy optimization objective (training loss) and actual attack success rate (ASR evaluated by GPT-4o judge), and acknowledges that minimizing loss does not always guarantee successful injection.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": false, 55 "justification": "Section 7 is titled 'Discussion' and focuses on mitigations rather than limitations of the evaluation; there is no dedicated limitations or threats-to-validity section.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "No systematic threats-to-validity analysis is present; specific issues like the small PPL40 dataset (40 examples), GPT-4o judge bias, or API non-determinism are mentioned incidentally but not systematically addressed.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": true, 67 "justification": "The paper clearly states the attack is demonstrated on Gemini's API specifically and frames it as proof-of-concept; Section 7 explicitly discusses which API settings would and would not allow the attack.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": true, 75 "justification": "Acknowledgements explicitly state 'supported in part by gifts from Amazon and Google and by NSF award 2312119.'", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations are clearly listed: UC San Diego and University of Wisconsin Madison.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": false, 87 "justification": "Google is both a 
funder (via gifts to the lab) and the primary target of the attacks evaluated; this creates a direct conflict of interest regardless of the critical findings.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No explicit competing interests or financial interests declaration appears beyond the funding acknowledgment; no statement of 'no competing interests' or patent/equity disclosures.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Key terms are formally defined: prompt injection (Eq. 5–7), logprobs (Eq. 2), graybox access, attack success rate, and the adversarial optimization objective are all given precise mathematical definitions.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "The 'Contributions' section explicitly lists two contributions: (1) new attack surface characterization of the fine-tuning interface, and (2) experimental evaluation of fun-tuning attacks on Gemini.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 8 provides a detailed related work section distinguishing this work from linguistic attacks, whitebox/graybox/blackbox automated attacks, covert malicious fine-tuning, and model stealing, clearly positioning the novel attack channel.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": true, 124 "justification": "Code is explicitly released at https://github.com/earlence-security/fun-tuning, stated in the Disclosure and Ethics section.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": false, 130 "justification": "The base PurpleLlama benchmark is public, 
but the specific PPL40 subset (40 randomly sampled examples with modified judge questions) is not explicitly released as a separate artifact.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "No requirements.txt, Dockerfile, or explicit dependency specifications are described in the paper; the code repository may contain these but the paper does not mention them.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "While Algorithms 1 and 2 describe the attack procedure formally, no step-by-step operational reproduction instructions (API setup, account configuration, exact commands) are provided in the paper.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": true, 150 "justification": "Standard deviations are reported for all main ASR results (e.g., 82.0 ± 4.2, 65.3 ± 3.8 in Tables 3–4) and for all transfer evaluation results in Tables 5–6.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No formal statistical significance tests (t-tests, Mann-Whitney, etc.) 
are performed for comparative claims; the paper relies solely on non-overlapping standard deviations as evidence of significance.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Tables 3 and 4 include an 'Improvement over baseline' factor (1.9x for Gemini 1.0 Pro, 2.4x for Gemini 1.5 Flash), providing effect size context alongside raw percentages.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "PPL40 contains only 40 examples; no power analysis or justification for why 40 examples provides sufficient statistical power for the reported claims is given.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": true, 174 "justification": "Standard deviations are consistently reported across all ASR tables; scoring is repeated 20 times (5 for transfer evaluation) to estimate variance across model non-determinism.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Two baselines are included: (1) unmodified PurpleLlama injections as the baseline, and (2) the ablation attack using random token substitution instead of fine-tuning loss.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "The ablation baseline (random substitution) is an appropriate contemporary comparison for an optimization-based method; comparing against NeuralExec-style approaches would have been stronger but the ablation design is sound.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "Section 6.4 explicitly describes an ablation where only the fine-tuning loss signal is removed (replaced with random numbers) while all other attack components remain identical.", 195 
"source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Results are reported across ASR (primary), improvement factor over baseline, number of fine-tuning requests, wall-clock time, and monetary cost.", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": false, 205 "answer": false, 206 "justification": "Human evaluation is not applicable here; attack success is judged by GPT-4o with refined judge questions, which is standard for automated red-teaming benchmarks.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": false, 211 "answer": false, 212 "justification": "Not applicable; this is an attack evaluation, not a prediction task with train/test splits.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Figures 8 and 9 show ASR broken down by injection scenario (code, exercise, population, transaction, password, etc.) for both Gemini 1.0 Pro and 1.5 Flash.", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "The password phishing category (~10-20% ASR) and code analysis failures against Gemini 1.5 Flash (40% ASR) are discussed with hypotheses about why they fail (safety tuning, improved code analysis).", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "Low ASR for password phishing scenarios is reported and discussed; the paper also notes that Gemini 1.0 Pro-sourced perturbations transfer less well to 1.5 Flash (~50-60% vs 87-88% within same family).", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 238 "justification": "Exact model version identifiers are given throughout: gemini-1.5-flash-001-tuning, gemini-1.0-pro-001, gemini-2.0-flash, gemini-1.5-pro-001, 
etc.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": false, 244 "justification": "The paper describes the prompt structure (system prompt + user prompt format, chat format using start_of_turn/end_of_turn tokens) but does not provide actual system prompts or the full judge question templates used.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Section 6.3 provides comprehensive hyperparameter details: 45 iterations, 2 restarts at iterations 15 and 30, 1000 candidates per iteration, 20-token prefix/suffix initialized with '!', learning rate range 10^-13 to 10^-45.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": false, 255 "answer": false, 256 "justification": "No agentic scaffolding is involved; the paper evaluates direct API interactions with the Gemini fine-tuning endpoint.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "PPL40 construction is described: random sampling from PurpleLlama indirect examples, exclusion of token smuggling category, judge question modification process, and training example formatting are all documented.", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": false, 270 "justification": "Code is available on GitHub but the paper does not state that raw experimental data (per-example ASR values, loss trajectories) is separately released for independent verification.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "The fine-tuning API interaction protocol is described in detail in Section 4, including how loss values are collected, the de-randomization procedure, and how ASR is measured via 20 repeated scoring runs.", 277 "source": "haiku" 278 }, 279 
"recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants; data comes from a public benchmark (PurpleLlama) and API responses.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "The full pipeline from dataset construction (PPL40 sampling) through permutation recovery, candidate ranking via fine-tuning, and ASR evaluation is formally specified in Algorithms 1 and 2.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": false, 295 "answer": false, 296 "justification": "Not applicable; this paper evaluates the fine-tuning interface as an attack surface, not model knowledge or capabilities on benchmarks.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": false, 301 "answer": false, 302 "justification": "Not applicable for the same reason; the PurpleLlama attack prompts may be in training data but this would only strengthen the attack baseline, not contaminate the evaluation of the optimization method itself.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": false, 307 "answer": false, 308 "justification": "Not applicable; the evaluation measures attack success rates, not model capability on held-out knowledge tasks.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants in this study.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants; ethics discussion covers responsible disclosure, not IRB.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants.", 329 "source": "haiku" 330 }, 331 
"inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": true, 360 "justification": "Tables 3 and 4 report per-example inference cost: $0.18 for Gemini 1.0 Pro and $0.02 for Gemini 1.5 Flash; total cost for all attacks is stated as less than $10.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": true, 366 "justification": "Attack time is reported as 15 hours per example for Gemini 1.0 Pro and 60 hours for Gemini 1.5 Flash; fine-tuning was free at time of writing, so total budget is dominated by the reported inference cost.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "Fine-tuning training loss is a useful proxy for log-probabilities of closed-weight LLMs, with R² approaching 1 as output length increases.", 375 "evidence": "Fig. 
3 shows R² between training loss and average logprobs asymptotically approaching 1 for output lengths > 200 tokens across 10 open-ended prompts.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Fun-tuning achieves 82.0% ASR against Gemini 1.0 Pro and 65.3% against Gemini 1.5 Flash on the PPL40 benchmark.", 380 "evidence": "Tables 3 and 4 report 82.0 ± 4.2% and 65.3 ± 3.8% respectively, with 20 repeated scoring runs per example.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Fun-tuning outperforms both baseline (unmodified injections) and random token substitution ablation, with improvements outside of standard deviation.", 385 "evidence": "Table 3: Fun-tuning 82.0% vs. ablation 61.3% vs. baseline 42.5%; Table 4: 65.3% vs. 43.8% vs. 27.5%; non-overlapping standard deviations for fun-tuning vs. ablation.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Attacks generated against Gemini 1.5 Flash transfer to other Gemini models with >72% ASR, and transfer to Gemini 2.0 Flash with >89% ASR.", 390 "evidence": "Table 6 shows Gemini 1.5 Flash perturbations achieve 72–73% on all Gemini 1.0 Pro variants and 89–90.5% on 2.0 Flash.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "The entire attack costs less than $10 in inference fees due to free fine-tuning calls at the time of writing.", 395 "evidence": "Section 6.5 states 'all our attacks combined cost less than $10 in completions endpoint calls'; Tables 3–4 show $0.02–$0.18 per example.", 396 "supported": "strong" 397 }, 398 { 399 "claim": "The attack exploits a fundamental utility-security tradeoff that makes complete mitigation by restricting hyperparameters difficult without harming benign developers.", 400 "evidence": "Section 7 discusses why minimum learning rate restrictions harm benign fine-tuning use cases; Google's actual mitigation (capping LR and minimum batch size) is acknowledged as partial.", 401 "supported": "moderate" 402 }, 403 { 404 "claim": "Other LLM fine-tuning APIs (OpenAI, 
Anthropic via AWS) may have been or remain vulnerable to similar attacks based on their hyperparameter exposure.", 405 "evidence": "Table 7 shows OpenAI allowed learning rate multipliers down to 10^-31 before January 2025 and batch size of 1; Anthropic allows batch size 'auto' — but no empirical attacks were conducted on these APIs.", 406 "supported": "weak" 407 } 408 ], 409 "methodology_tags": [ 410 "benchmark-eval", 411 "case-study" 412 ], 413 "key_findings": "The paper demonstrates that fine-tuning APIs for closed-weight LLMs expose enough information to enable optimization-based prompt injection attacks: by setting a near-zero learning rate, the reported training loss approximates log-probabilities with R² approaching 1, enabling greedy discrete optimization without model weight access. Against Google's Gemini family, this achieves 65–82% attack success rate on the PurpleLlama benchmark at under $10 total cost, outperforming both unmodified injections and random token substitution. Attacks generated against Gemini 1.5 Flash transfer to other Gemini models with high success rates (>72%), and Google deployed mitigations (capping learning rate and minimum batch size) in April 2025 following responsible disclosure.", 414 "red_flags": [ 415 { 416 "flag": "Small evaluation dataset", 417 "detail": "PPL40 has only 40 examples, which limits statistical power; no power analysis or justification for this sample size is provided." 418 }, 419 { 420 "flag": "Single API tested empirically", 421 "detail": "All empirical results are on Gemini only; claims about other APIs (Table 7) are based on hyperparameter inspection, not actual attack demonstration." 422 }, 423 { 424 "flag": "LLM-as-judge with known biases", 425 "detail": "GPT-4o is used as the judge model; the paper acknowledges that the original PurpleLlama judge questions were 'overly permissive' and required manual correction, raising questions about whether systematic judge errors remain."
426 }, 427 { 428 "flag": "High ablation ASR undermines signal contribution", 429 "detail": "Random token substitution achieves 43.8–61.3% ASR vs. baseline 27.5–42.5%, suggesting Gemini's inherent susceptibility to token perturbation is a confound; the marginal contribution of the fine-tuning loss signal over random search is real but smaller than the headline numbers imply." 430 }, 431 { 432 "flag": "Funder conflict", 433 "detail": "Google is both a funder (via lab gifts) and the primary target of the attacks evaluated in the paper." 434 }, 435 { 436 "flag": "No formal significance testing", 437 "detail": "Comparisons between methods rely on non-overlapping standard deviations rather than formal hypothesis tests, which is insufficient for small-sample comparisons." 438 } 439 ], 440 "cited_papers": [ 441 { 442 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 443 "relevance": "Foundational work on indirect prompt injection attacks that established the threat model and real-world consequences this paper builds on." 444 }, 445 { 446 "title": "Universal and transferable adversarial attacks on aligned language models", 447 "relevance": "Introduced the Greedy Coordinate Gradient (GCG) algorithm that fun-tuning adapts for the graybox setting without gradient access." 448 }, 449 { 450 "title": "CybersecEval 2: A wide-ranging cybersecurity evaluation suite for large language models", 451 "relevance": "Source of the PurpleLlama prompt injection benchmark (PPL40) used as the evaluation dataset in this paper." 452 }, 453 { 454 "title": "Query-based adversarial prompt generation", 455 "relevance": "Prior graybox attack using logprobs that fun-tuning replaces with fine-tuning loss as an alternative information channel." 
456 }, 457 { 458 "title": "PAL: Proxy-guided black-box attack on large language models", 459 "relevance": "Related graybox approach that requires logprobs; contrasted with fun-tuning which works when logprobs are unavailable." 460 }, 461 { 462 "title": "Neural Exec: Learning (and learning from) execution triggers for prompt injection attacks", 463 "relevance": "Whitebox prompt injection using GCG whose attack structure (prefix/suffix wrapping) fun-tuning adapts to the closed-weight setting." 464 }, 465 { 466 "title": "Covert malicious finetuning: Challenges in safeguarding LLM adaptation", 467 "relevance": "Orthogonal line of work on encoding malicious training data to evade pre-fine-tuning moderation, relevant to mitigation discussion." 468 }, 469 { 470 "title": "The instruction hierarchy: Training LLMs to prioritize privileged instructions", 471 "relevance": "Describes the safety fine-tuning approach (Gemini resistance to some injections) that fun-tuning works around." 472 } 473 ], 474 "engagement_factors": { 475 "practical_relevance": { 476 "score": 3, 477 "justification": "Directly actionable for security practitioners: demonstrates a novel attack channel against production APIs with a released tool, and Google's deployment of mitigations confirms real-world impact." 478 }, 479 "surprise_contrarian": { 480 "score": 3, 481 "justification": "The insight that a 'free' feature (fine-tuning loss reporting) can circumvent logprob restrictions that vendors deployed as security mitigations is genuinely counterintuitive." 482 }, 483 "fear_safety": { 484 "score": 3, 485 "justification": "Shows that safety-tuned closed-weight models can be attacked via a non-obvious API side-channel, undermining confidence in the completeness of vendor security measures." 
486 }, 487 "drama_conflict": { 488 "score": 2, 489 "justification": "Responsible disclosure narrative (disclosed Nov 2024, patched April 2025) and the implicit tension of Google funding research that attacks Google's products add drama." 490 }, 491 "demo_ability": { 492 "score": 2, 493 "justification": "Code is publicly available, but Google's April 2025 mitigation (capping learning rate and minimum batch size) means the attack no longer works on the primary target as described." 494 }, 495 "brand_recognition": { 496 "score": 3, 497 "justification": "Explicitly about Google Gemini with named model versions; Table 7 also discusses OpenAI and Anthropic APIs, maximizing brand recognition for the LLM security audience." 498 } 499 }, 500 "hn_data": { 501 "threads": [ 502 { 503 "hn_id": "42789001", 504 "title": "VideoWorld: Exploring Knowledge Learning from Unlabeled Video", 505 "points": 2, 506 "comments": 0, 507 "url": "https://news.ycombinator.com/item?id=42789001" 508 } 509 ], 510 "top_points": 2, 511 "total_points": 2, 512 "total_comments": 0 513 } 514 }