ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (33127B)


      1 {
      2   "paper": {
      3     "title": "RL Is a Hammer and LLMs Are Nails: A Simple Reinforcement Learning Recipe for Strong Prompt Injection",
      4     "authors": [
      5       "Yuxin Wen",
      6       "Arman Zharmagambetov",
      7       "Ivan Evtimov",
      8       "Narine Kokhlikyan",
      9       "Tom Goldstein",
     10       "Kamalika Chaudhuri",
     11       "Chuan Guo"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv (Preprint, Under Review)",
     15     "arxiv_id": "2510.04885",
     16     "doi": "10.48550/arXiv.2510.04885"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "RL-Hammer, a GRPO-based RL recipe for training prompt injection attackers from scratch, achieves 98% ASR against GPT-4o and 72% against GPT-5 by jointly training on easy and robust target models with soft rewards and no KL regularization. The paper demonstrates that current defenses (Instruction Hierarchy, SecAlign) remain fragile against automated RL-based attacks. Diversity rewards lead to reward-hacking rather than genuinely novel attack strategies. RL-Hammer naturally evades most prompt injection detectors and can fully bypass all tested detectors when trained with a stealthiness reward.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The abstract states 'Code is available at https://github.com/facebookresearch/rl-injector' — a concrete GitHub URL is provided."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The paper uses publicly available benchmarks: InjecAgent (Zhan et al., 2024), AgentDojo (Debenedetti et al., 2024), and AdvBench (Zou et al., 2023). No proprietary data was collected."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions 'a single NVIDIA H200 node' and 'Hugging Face TRL library' with LoRA fine-tuning, but provides no requirements.txt, Dockerfile, or detailed library version listing sufficient to recreate the environment."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub repository is referenced but the paper itself does not contain commands or a reproduction guide."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "All results in Tables 1, 3, 4, 5, and 7 are reported as point estimates (e.g., '98.00%') with no confidence intervals or error bars."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper makes many comparative claims (e.g., RL-Hammer vs. GCG, naive GRPO vs. RL-Hammer) based solely on comparing raw ASR numbers without any statistical significance tests."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "ASR percentages are reported alongside baselines, providing sufficient context to assess magnitude. For example, GPT-4o goes from 0% (baseline) to 98% (RL-Hammer), and the jailbreak comparison shows RL-Hammer at 99% vs Jailbreak-R1 at 47% (Table 4)."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The test set is 100 samples from InjecAgent with no justification for why this size is sufficient. No power analysis is discussed."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No standard deviation, variance, or spread measures are reported for any results. Section 5.2 acknowledges that 'different random seeds can lead the model to settle on distinct strategies,' yet the main results appear to be from single runs."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Table 1 includes four baselines: Default Prompt, Enhanced, Llama-3.1-8B-Instruct (zero-shot), and GCG. Table 4 compares against Jailbreak-R1. Table 3 compares against Default Prompt and Tool Knowledge."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "GCG (Zou et al., 2023) and Jailbreak-R1 (Guo et al., 2025) are contemporary and represent the state of the art in adversarial prompt generation."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Section 5.1 presents four ablation studies: KL coefficient (Figure 1a), joint training (Figure 1b), soft vs. hard reward (Figure 1c), and format reward (Figure 1d). Each isolates a single design choice."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "The paper evaluates ASR as the primary metric, plus diversity metrics (BLEU, BERTScore, embedding distance, LLM judge) in Table 5 and detectability rates across four detectors in Table 7."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "All evaluation is automated. Attack success is determined by string parsing. Diversity and detectability use automated metrics and LLM judges. No human evaluation of attack quality or realism is conducted."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 4.1 states: 'We use the InjecAgent dataset, splitting it into 100 samples for validation, 100 samples for testing, and the remaining 310 samples for training.' The same split approach is used for AgentDojo and AdvBench."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Table 1 provides per-target-model breakdowns across 10 models. Results are also broken down by benchmark (InjecAgent, AgentDojo, AdvBench) and by training-time target model."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 5.2 discusses how diversity rewards lead to reward-hacking rather than genuine diversity, with examples in Table 6. Section 3.1 discusses naive GRPO's failure on robust models. Appendix Table 9 shows degenerate outputs without format reward."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Naive GRPO fails on defended models (Section 3.1, Figure 2b). Training on easy model alone fails to transfer (0% ASR on GPT-4o, Table 1). Diversity rewards are reward-hacked (Section 5.2). Format reward alone leads to overfitting on format compliance over attack success (Section 3.2)."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "Both headline figures in the abstract, '98% ASR against GPT-4o' and '72% ASR against GPT-5', are supported by Table 1. The negative answer rests on a related claim made in the body of the paper: Section 4.2 states 'RL-Hammer attains ≥80% ASR on every evaluated target', yet Claude-4-Sonnet achieves only 77% even when trained directly on it, contradicting the ≥80% claim."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The ablation studies in Section 5.1 make causal claims (removing KL improves ASR, joint training enables learning on robust models) through controlled single-variable manipulation. Each ablation changes one factor while holding others constant, which is adequate for the causal claims made."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper tests on 10 specific target models and 3 benchmarks, and generally frames claims in terms of these tested settings. The title is provocative ('LLMs Are Nails') but the body consistently references specific models and benchmarks."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper does not consider alternative explanations for its results. For example, it does not discuss whether the high ASR might be partly due to the InjecAgent task design (only two tools, worst-case scenario by their own admission), or whether the string-parsing judge may be over-counting successes."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper measures ASR (whether the target model executes the attacker-specified action, verified by string parsing) and frames it as attack success rate. The measurement matches the framing with no proxy gap."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "Commercial models are referenced by marketing names only: 'GPT-4o', 'GPT-5', 'GPT-5-mini', 'Gemini-2.5-Flash', 'Claude-3.5-Sonnet', 'Claude-4-Sonnet'. No API version strings or snapshot dates are provided. Open-source models are specified (e.g., 'Llama-3.1-8B-Instruct') but still lack exact version hashes."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The attacker prompt is provided in full in Section A.1. The LLM diversity judge prompt is in A.7. The LLM prompt injection judge prompt is in A.8. These are the actual prompts used, not just descriptions."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 4.1 reports: '8 rollouts per injection goal with a batch size of 8 injection goals and a learning rate of 1e-5 with 40 epochs.' LoRA is used for fine-tuning. The KL coefficient ablation (β values) is in Section 5.1."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "The RL-Hammer attacker is a standard RL-trained LLM that generates text, not an agentic scaffold. Target models use tool-calling features but are evaluated as black boxes. No agentic scaffolding is central to the method."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 4.1 describes the dataset split: 'We use the InjecAgent dataset, splitting it into 100 samples for validation, 100 samples for testing, and the remaining 310 samples for training.' Similar splits are described for AgentDojo and AdvBench."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 6 is titled 'CONCLUSION AND LIMITATION' and contains substantive discussion of training cost, query requirements, and potential for detection/banning by model providers."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The limitations section identifies a specific threat: 'Training remains costly, largely due to the large number of queries required to interact with the target model. In practice, excessive adversarial queries may trigger detection and banning by model providers.' This is specific to this study's approach."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound the scope to specific deployment contexts, does not discuss what attack scenarios were not tested, and does not spell out how InjecAgent's two-tool setup simplifies real-world scenarios beyond calling it 'a kind of worst-case testing scenario' and mentioning that AgentDojo is 'more realistic'."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "No raw experimental data (per-example attack results, model outputs, training logs) is released. Only aggregate ASR numbers are reported in tables."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The paper uses three public benchmarks (InjecAgent, AgentDojo, AdvBench) with clear provenance. Dataset splits and usage are described in Section 4.1."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. Data sources are standard public benchmarks."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The pipeline from benchmark dataset to final ASR is documented: split into train/val/test → GRPO training with specified rollouts and batch size → attack generation → string-parsing verification → ASR computation. No unexplained data loss or filtering."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No explicit funding acknowledgment section. The work was done during an internship at FAIR at Meta, implying Meta funding, but no formal disclosure is provided."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are clearly listed: University of Maryland and FAIR at Meta. The footnote states 'Work done during an internship at FAIR at Meta.'"
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "Meta develops both the defense evaluated (Meta SecAlign) and the attack (RL-Hammer). Meta has commercial interests in LLM safety. The funder is not independent of the outcome, though the paper does show Meta's own defense is vulnerable."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interest declaration is present in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "This paper tests defenses against prompt injection attacks, not model knowledge on benchmarks. The evaluation measures whether defenses can be bypassed, not whether models have memorized benchmark answers."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "Same rationale: the paper evaluates defense robustness against automated attacks, not model capability on benchmark tasks."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "Same rationale: contamination of benchmark knowledge is not the concern when testing attack effectiveness against defense mechanisms."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study. All experiments are automated attacks against LLMs."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants. The paper includes an ethics statement about responsible disclosure of the attack."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No inference cost, API cost, or per-attack cost is reported. The limitations section acknowledges 'Training remains costly, largely due to the large number of queries required' but does not quantify costs."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "The paper states 'All experiments are conducted on a single NVIDIA H200 node' but does not report total GPU hours, wall-clock training time, or total API spend for querying commercial target models."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Section 5.2 acknowledges 'different random seeds can lead the model to settle on distinct strategies,' indicating seed sensitivity, but the main ASR results in Tables 1, 3, 4 do not report results across multiple seeds."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of experimental runs producing the main results is never stated. Results appear to be from single training runs."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Hyperparameters are reported (learning rate, batch size, epochs) but no search budget, number of configurations tried, or search method is described."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "No explanation of how the final hyperparameter configuration was selected. The ablation studies vary individual parameters but do not describe a systematic selection process."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "No statistical tests are performed in the paper, so correction for multiple comparisons does not arise."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors evaluate their own RL-Hammer against baselines without acknowledging self-comparison bias. They also evaluate against their own Meta SecAlign defense without discussing whether their insider knowledge of SecAlign's architecture could influence attack design."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "Performance is not reported as a function of compute budget. The training curves in Figure 1 show reward vs. training steps but not wall-clock time or compute cost. Different target models likely require vastly different API query budgets, which is not discussed."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The paper does not discuss whether InjecAgent (its primary benchmark) adequately measures real-world prompt injection risk. It briefly notes AgentDojo is 'more realistic and complex' and that InjecAgent represents 'a kind of worst-case testing scenario' with only two tools, but does not analyze construct validity."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "Section 4.1 states 'For white-box models, we use prompting to enable tool usage, whereas for all black-box models, we rely on the tool-calling feature.' This means results across models are confounded by different scaffolding approaches, which is not discussed."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether InjecAgent or AdvBench examples may have appeared in the training data of target models. The attacker is trained from scratch, but target models may have been exposed to the benchmark tasks during pre-training or safety training."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether the evaluation setup leaks information. For example, the two-tool InjecAgent setup (benign tool + adversary's target tool) is an extreme simplification that may make attacks artificially easier."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of whether training and test splits from InjecAgent share structural similarities that could inflate results."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No leakage detection or prevention method is used for the benchmarks."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "RL-Hammer achieves 98% ASR against GPT-4o when jointly trained on Llama-3.1-8B-Instruct and GPT-4o.",
    373       "evidence": "Table 1 shows 98% ASR on GPT-4o when GPT-4o is the training-time target model. Baseline methods achieve 0-3% on GPT-4o.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "RL-Hammer achieves 72% ASR against GPT-5 with Instruction Hierarchy defense.",
    378       "evidence": "Table 1 shows 72% ASR on GPT-5 when jointly trained on GPT-5-mini and GPT-5. Baseline methods achieve 0% on GPT-5.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "RL-Hammer attains ≥80% ASR on every evaluated target.",
    383       "evidence": "Table 1 shows best results per target model. However, Claude-4-Sonnet achieves only 77% ASR when trained directly on it, which is below the stated 80% threshold.",
    384       "supported": "weak"
    385     },
    386     {
    387       "claim": "Removing KL regularization improves attacker performance and convergence speed.",
    388       "evidence": "Figure 1a ablation shows β=0 achieves near-perfect rewards while β=0.01 converges to a suboptimal minimum. Section 5.1 provides analysis.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Joint training on easy and robust target models overcomes sparse reward and enables high ASR on robust models.",
    393       "evidence": "Figure 1b shows direct training on the robust model produces near-zero rewards, while joint training achieves high rewards on both models. Section 3.2 describes the transfer mechanism.",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "Diversity rewards lead to reward-hacking rather than genuinely diverse attack strategies.",
    398       "evidence": "Table 5 shows diversity metrics improve under each reward, but Table 6 demonstrates that the attacker manipulates metrics via superficial changes (case variation, prepended irrelevant text) rather than producing novel strategies.",
    399       "supported": "strong"
    400     },
    401     {
    402       "claim": "RL-Hammer naturally evades most prompt injection detectors and fully bypasses all four when trained with stealthiness reward.",
    403       "evidence": "Table 7 shows 0% perplexity detection, 16% Llama-Prompt-Guard, 17% ProtectAI-Guard for base RL-Hammer. With LLM Judge reward, all four detectors drop to 0% while ASR remains at 97%.",
    404       "supported": "strong"
    405     },
    406     {
    407       "claim": "RL-Hammer achieves 99% ASR on GPT-4o and 97% on Claude-3.5-Sonnet for the AdvBench jailbreak task.",
    408       "evidence": "Table 4 reports these results. Jailbreak-R1 achieves 47% (GPT-4o) and 3% (Claude-3.5-Sonnet) at 10 attempts for comparison.",
    409       "supported": "moderate"
    410     },
    411     {
    412       "claim": "RL-Hammer achieves 51% ASR on AgentDojo against GPT-4o, approaching the 63% upper bound of direct benign instruction.",
    413       "evidence": "Table 3 reports 51% ASR vs 63% upper bound (direct benign instruction) on GPT-4o. However, results are on a 100-sample test set without error bars.",
    414       "supported": "moderate"
    415     }
    416   ],
    417   "red_flags": [
    418     {
    419       "flag": "No error bars or variance on any results",
    420       "detail": "All main results (Tables 1, 3, 4, 5, 7) are reported as single point estimates. The paper acknowledges that different seeds lead to different strategies (Section 5.2), meaning reported ASR numbers could vary substantially across runs. For a 100-sample test set, a 98% ASR has a 95% CI of roughly ±2.7pp under the normal (Wald) approximation to the binomial, yet no uncertainty is quantified."
    421     },
    422     {
    423       "flag": "≥80% ASR claim contradicted by own results",
    424       "detail": "Section 4.2 claims 'RL-Hammer attains ≥80% ASR on every evaluated target.' However, Claude-4-Sonnet achieves only 77% ASR even when trained directly on it (Table 1), contradicting the stated threshold."
    425     },
    426     {
    427       "flag": "Meta researchers evaluating Meta's own defense and attack",
    428       "detail": "Five of seven authors are affiliated with FAIR at Meta. They evaluate RL-Hammer (their attack) against Meta SecAlign (their defense) and commercial competitors. While showing their own defense is vulnerable suggests transparency, insider knowledge of SecAlign's architecture could advantage the attacker design."
    429     },
    430     {
    431       "flag": "No model version pinning for commercial APIs",
    432       "detail": "All commercial models are referenced by marketing names (GPT-4o, GPT-5, Claude-3.5-Sonnet, Gemini-2.5-Flash) without API version strings or snapshot dates. These models change over time, making results non-reproducible."
    433     },
    434     {
    435       "flag": "Training cost not quantified despite being a stated limitation",
    436       "detail": "The limitations section acknowledges training is costly and requires many queries to target models, but never quantifies the actual cost (API spend, GPU hours, total queries). This omission prevents assessing the practical threat level of the attack."
    437     },
    438     {
    439       "flag": "InjecAgent worst-case setup may inflate ASR",
    440       "detail": "The paper acknowledges InjecAgent restricts the agent to only two tools (benign + adversary's target), calling it 'a kind of worst-case testing scenario.' ASR on the more realistic AgentDojo (many tools) drops to 51%, suggesting the headline 98% figure may not reflect real-world conditions."
    441     }
    442   ],
    443   "cited_papers": [
    444     {
    445       "title": "SecAlign: Defending against prompt injection with preference optimization",
    446       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "David Wagner", "Chuan Guo"],
    447       "year": 2024,
    448       "arxiv_id": "2410.05451",
    449       "relevance": "Major prompt injection defense via preference optimization, directly evaluated and shown to be breakable by RL-Hammer."
    450     },
    451     {
    452       "title": "The instruction hierarchy: Training LLMs to prioritize privileged instructions",
    453       "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"],
    454       "year": 2024,
    455       "arxiv_id": "2404.13208",
    456       "relevance": "Key prompt injection defense mechanism that RL-Hammer demonstrates can be bypassed on GPT models."
    457     },
    458     {
    459       "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents",
    460       "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"],
    461       "year": 2024,
    462       "arxiv_id": "2403.02691",
    463       "relevance": "Primary benchmark dataset used to evaluate prompt injection attacks on tool-using LLM agents."
    464     },
    465     {
    466       "title": "AgentDojo: A dynamic environment to evaluate prompt injection attacks and defenses for LLM agents",
    467       "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunovic", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"],
    468       "year": 2024,
    469       "relevance": "More realistic multi-tool prompt injection benchmark used for secondary evaluation of RL-Hammer."
    470     },
    471     {
    472       "title": "Universal and transferable adversarial attacks on aligned language models",
    473       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J Zico Kolter", "Matt Fredrikson"],
    474       "year": 2023,
    475       "arxiv_id": "2307.15043",
    476       "relevance": "GCG attack method used as a primary baseline and source of the AdvBench jailbreak benchmark."
    477     },
    478     {
    479       "title": "AdvPrompter: Fast adaptive adversarial prompting for LLMs",
    480       "authors": ["Anselm Paulus", "Arman Zharmagambetov", "Chuan Guo", "Brandon Amos", "Yuandong Tian"],
    481       "year": 2024,
    482       "arxiv_id": "2404.16873",
    483       "relevance": "Prior RL-based adversarial prompting approach that SecAlign was shown to be robust against; RL-Hammer, in contrast, bypasses SecAlign."
    484     },
    485     {
    486       "title": "Jailbreak-R1: Exploring the jailbreak capabilities of LLMs via reinforcement learning",
    487       "authors": ["Weiyang Guo", "Zesheng Shi", "Zhuo Li", "Yequan Wang", "Xuebo Liu", "Wenya Wang", "Fangming Liu", "Min Zhang", "Jing Li"],
    488       "year": 2025,
    489       "arxiv_id": "2506.00782",
    490       "relevance": "State-of-the-art RL-based jailbreak attack that RL-Hammer significantly outperforms on AdvBench."
    491     },
    492     {
    493       "title": "Neural exec: Learning (and learning from) execution triggers for prompt injection attacks",
    494       "authors": ["Dario Pasquini", "Martin Strohmeier", "Carmela Troncoso"],
    495       "year": 2024,
    496       "relevance": "Gradient-based prompt injection attack that SecAlign was previously shown to resist."
    497     },
    498     {
    499       "title": "Meta SecAlign: A secure foundation LLM against prompt injection attacks",
    500       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "David Wagner", "Chuan Guo"],
    501       "year": 2025,
    502       "arxiv_id": "2507.02735",
    503       "relevance": "Commercial-grade defense model (Meta SecAlign 8B/70B) that RL-Hammer achieves up to 99% ASR against."
    504     },
    505     {
    506       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    507       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    508       "year": 2023,
    509       "doi": "10.1145/3605764.3623985",
    510       "relevance": "Foundational work on indirect prompt injection attacks against LLM-integrated applications."
    511     },
    512     {
    513       "title": "HarmBench: A standardized evaluation framework for automated red teaming and robust refusal",
    514       "authors": ["Mantas Mazeika", "Long Phan", "Xuwang Yin", "Andy Zou"],
    515       "year": 2024,
    516       "relevance": "Standardized red-teaming evaluation framework; the HarmBench-Llama-2-13b-cls judge is used to evaluate jailbreak success."
    517     },
    518     {
    519       "title": "WASP: Benchmarking web agent security against prompt injection attacks",
    520       "authors": ["Ivan Evtimov", "Arman Zharmagambetov", "Aaron Grattafiori", "Chuan Guo", "Kamalika Chaudhuri"],
    521       "year": 2025,
    522       "arxiv_id": "2504.18575",
    523       "relevance": "Web agent security benchmark for prompt injection attacks from the same research group."
    524     },
    525     {
    526       "title": "Commercial LLM agents are already vulnerable to simple yet dangerous attacks",
    527       "authors": ["Ang Li", "Yin Zhou", "Vethavikashini Chithrra Raghuram", "Tom Goldstein", "Micah Goldblum"],
    528       "year": 2025,
    529       "arxiv_id": "2502.08586",
    530       "relevance": "Demonstrates vulnerability of commercial LLM agents to simple prompt injection attacks."
    531     }
    532   ],
    533   "engagement_factors": {
    534     "practical_relevance": {
    535       "score": 2,
    536       "justification": "Code is released and practitioners can use RL-Hammer to red-team their own LLM agent defenses, but requires significant ML infrastructure to train the attacker."
    537     },
    538     "surprise_contrarian": {
    539       "score": 3,
    540       "justification": "Directly contradicts the widely held belief that Instruction Hierarchy and SecAlign provide robust prompt injection defenses, achieving 98% ASR against GPT-4o and 72% against GPT-5."
    541     },
    542     "fear_safety": {
    543       "score": 3,
    544       "justification": "Demonstrates a novel, automated attack that bypasses state-of-the-art defenses including GPT-5's Instruction Hierarchy and, when trained with a stealthiness reward, evades all tested prompt injection detectors."
    545     },
    546     "drama_conflict": {
    547       "score": 2,
    548       "justification": "Meta researchers showing that both their own SecAlign defense and competitors' defenses (OpenAI, Anthropic, Google) are fragile creates an implicit 'current defenses are an illusion' narrative."
    549     },
    550     "demo_ability": {
    551       "score": 2,
    552       "justification": "Code is publicly released at github.com/facebookresearch/rl-injector, but running it requires RL training infrastructure and API access to target models."
    553     },
    554     "brand_recognition": {
    555       "score": 2,
    556       "justification": "From FAIR at Meta, attacks GPT-4o/GPT-5 (OpenAI), Claude-3.5/4-Sonnet (Anthropic), and Gemini-2.5-Flash (Google) — all major brand-name models."
    557     }
    558   }
    559 }

Impressum · Datenschutz