ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (33319B)


      1 {
      2   "paper": {
      3     "title": "DRIP: Defending Prompt Injection via Token-wise Representation Editing and Residual Instruction Fusion",
      4     "authors": [
      5       "Ruofan Liu",
      6       "Yun Lin",
      7       "Zhiyong Huang",
      8       "Jin Song Dong"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv preprint",
     12     "arxiv_id": "2511.00447",
     13     "doi": "10.48550/arXiv.2511.00447"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "DRIP proposes a two-component defense against prompt injection: a token-wise representation editing layer that shifts data token embeddings away from the instruction manifold, and a residual instruction fusion pathway that anchors the output to the intended instruction. Evaluated on SEP, AlpacaFarm, and InjecAgent benchmarks against StruQ, SecAlign, ISE, and PFT baselines on LLaMA-8B and Mistral-7B, DRIP achieves 0% ASR on all heuristic-based attacks and reduces GCG adaptive attack ASR to 1–3%, while maintaining utility comparable to the undefended model (83.89% vs 85.37% on AlpacaEval 2.0 for LLaMA-8B).",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "An anonymous code repository is provided at https://anonymous.4open.science/r/PromptInjection-BD09 (footnote 1, Open Science section). The paper states 'All the documents and installation guidance are available.' However, this is an anonymous review link and long-term availability is uncertain."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Evaluation uses publicly available benchmarks: SEP (Zverev et al.), AlpacaFarm (Dubois et al.), InjecAgent (Zhan et al.), AlpacaEval 2.0, IFEval, and MT-Bench. Training data is curated from the public SEP training split and SQuAD. The curated DPO training data itself may be in the code repository."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "Section 3.4 mentions hardware (6 NVIDIA RTX 5880 GPUs, 48GB each) and training hyperparameters, but no software dependencies, Python version, library versions, requirements.txt, or Dockerfile are provided in the paper."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper describes the training and evaluation setup conceptually (Sections 3.4, 4.1) but provides no step-by-step reproduction instructions, commands to run, or a 'Reproducing Results' section. The anonymous repo may contain a README but this cannot be verified from the paper."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "All results in Tables 3–7 are reported as point estimates only. No confidence intervals, error bars, or ± notation appear anywhere in the paper."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims DRIP 'improves role separation score by 12–49%' and 'reduces attack success rate by over 66%' but provides no p-values, t-tests, or any statistical significance tests to support these comparisons."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Results are reported with baseline context throughout. For example, Table 3 shows DRIP at 80.9% vs SecAlign 31.9% SEP on LLaMA-8B. Table 5 shows GCG ASR of 1.06% vs 66.67% for SecAlign. The reader can assess the magnitude of improvements from the absolute numbers."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Benchmark sizes are stated (SEP: 9,160 tuples, AlpacaFarm: 208 examples after filtering, InjecAgent: 1,054 test cases) but no justification is given for why these sizes are adequate, especially the filtered AlpacaFarm subset of only 208 examples."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No standard deviations, variance, or spread measures are reported. All results appear to be from single experimental runs with no indication of multiple seeds or runs."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Five baselines are compared: Undefended, StruQ, SecAlign, ISE, and PFT (Section 4.1.2). All are evaluated on the same benchmarks under the same conditions."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "All baselines are from 2024–2025: StruQ (2024), SecAlign (2024), ISE (2024), PFT (2024/2025). These represent the current state-of-the-art in training-time prompt injection defenses."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Section 4.4 presents a thorough ablation study (Table 7) testing removal of Case 2, Case 3, replacing token-wise editing with embedding shift, using concat vs sum fusion, and removing fusion entirely. Each variant modifies one component while keeping others fixed."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Multiple metrics are used: SEP score for role separation, ASR for attack success (under heuristic and optimization-based attacks), and three utility metrics (AlpacaEval 2.0 win%, IFEval accuracy%, MT-Bench scores across 8 axes)."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "All evaluations are automated: SEP uses witness string detection, ASR uses string matching, AlpacaEval and MT-Bench use LLM-as-judge (GPT-4), and IFEval uses rule-based checks. No human evaluation of defense quality is conducted."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Training uses the SEP training split (10k tuples, Section 3.2), while evaluation uses the separate SEP evaluation benchmark (9,160 tuples, Section 4.1.1). AlpacaFarm and InjecAgent are entirely separate benchmarks not used in training."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Table 5 (and full Table 8) breaks down ASR by attack family (Naive, Ignore variants 0–10, Completion variants, Escape variants, HackaPrompt, GCG, NeuralExec). Figure 8 shows MT-Bench scores across 8 skill categories."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 4.5.1 discusses a failure case where DRIP produces a 'semantic echo' — the model avoids direct execution of the injected instruction but integrates the injected concept ('sleep') into an open-ended pun task (Figure 10)."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The ablation study (Table 7) reports configurations that degrade performance: removing Case 2 drops SEP by 22.4%, concat fusion drops utility by 13.75pp, removing fusion spikes GCG ASR to 62.8%. Section 4.5.2 shows test-time defenses that degrade utility (Fake Completion)."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claims of 12–49% SEP improvement are supported by Table 3 (80.9% vs 31.9% on LLaMA, 70.7% vs 58.6% on Mistral). The 66% ASR reduction claim is supported by Table 5 (GCG: 1.06% vs 66.67%). Utility 'on par with undefended' is supported by Table 6 (83.89% vs 85.37% AlpacaEval)."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Causal claims like 'DRIP improves role separation' are supported by ablation studies (Table 7) that isolate individual components through controlled single-variable manipulation. Each ablation modifies one design element while keeping others fixed."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The abstract specifies 'on LLaMA-8B and Mistral-7B across three prompt injection benchmarks.' Section 4.5.4 explicitly bounds scope: only 7–8B models, single-turn settings, text-only attacks. Model scale, multi-turn, and multimodal limitations are all stated."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "While the ablation study isolates component contributions, the paper does not discuss alternative explanations for the observed improvements. For example, it does not consider whether the DPO training alone (without architectural changes) or the data curation improvements might account for gains, or whether the additional parameters (0.21%) could confound comparisons."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper measures ASR and SEP score on specific benchmarks and frames these as measuring 'robustness against prompt injection' generally. No discussion of whether benchmark injection scenarios reflect real-world prompt injection threats, or whether the witness-string-based SEP metric captures the full spectrum of injection success."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper refers to 'LLaMA-8B' and 'Mistral-7B' without specifying exact model versions or checkpoints. Reference [19] points to 'The Llama 3 herd of models' but no specific snapshot (e.g., Meta-Llama-3-8B-Instruct) is given. Reference [25] for Mistral also lacks a specific version."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Figure 11 provides the full prompt used for training response generation via GPT-4o. Figure 12 provides the full auditor prompt for response verification. Evaluation follows standard benchmark protocols whose prompts are publicly available."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 3.4 reports LoRA rank r=16, α=8, dropout=0.05, one epoch, global batch size 24, learning rate 1×10⁻⁴. Hardware: 6 NVIDIA RTX 5880 GPUs with 48GB each. GCG suffix length of 20 tokens is stated (Table 2/Figure 2)."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. DRIP is a training-time defense applied to standard LLM architectures."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 3.2 documents the data curation pipeline: starting from SEP training split (10k tuples), discarding original injected tasks, resampling from SQuAD, generating responses with GPT-4o, applying XML-tagging sanitization and LLM-as-judge auditing. The iterative refinement process is described with the specific strategies used."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 4.5.4 'Future Work' serves as a limitations section, discussing three specific limitations: model scale (7–8B only), single-turn evaluation only, and text-only modality. Section 4.5.1 also discusses a failure mode."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 4.5.4 identifies threats specific to this study: 'All experiments in this work are conducted on open-source models in the 7B–8B parameter range... primarily due to computational and training resource constraints,' 'designed and evaluated in single-turn settings,' and lack of multimodal evaluation."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 4.5.4 explicitly states what was NOT tested: larger models ('13B or 34B'), multi-turn interactions ('multi-turn reasoning and memory'), and multimodal attacks ('vision-language models'). The threat model (Section 2) bounds the attack surface to indirect text injection."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The curated DPO training data (GPT-4o generated responses, audited pairs) is not independently available for verification. Evaluation benchmarks are public, but the core training data artifact that drives DRIP's performance is not verifiable."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 3.2 describes the data curation process in detail: source (SEP training split, SQuAD), resampling strategy, GPT-4o response generation with specific prompts (Figure 11), XML-tagging sanitization, and LLM-as-judge auditing (Figure 12)."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. Data sources are standard public benchmarks (SEP from SQuAD/Alpaca, AlpacaFarm, InjecAgent)."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Figure 4 visualizes the full data curation pipeline: DPO pair construction → GPT-4o response generation → LLM-as-judge auditing → iterative refinement. Section 3.2 describes each step including sanitization strategies (response integrity and response utility)."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No acknowledgments section, funding statement, or grant information appears anywhere in the paper."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are clearly listed: National University of Singapore and Shanghai Jiao Tong University. These are academic institutions with no obvious commercial stake in the evaluated methods."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure means this criterion cannot be satisfied."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests statement or financial disclosure appears in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "This paper tests a defense mechanism against prompt injection, not a pre-trained model's knowledge or capability on a benchmark. Contamination of the base model's training data is not relevant to evaluating defense effectiveness."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "This paper evaluates defenses against injection attacks, not model knowledge. The relevant train/test separation (SEP training split vs evaluation set) is maintained by design."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "Benchmark contamination (model having seen test tasks during pre-training) is not the concern here — the paper evaluates whether injected instructions are executed, which tests defense behavior rather than model knowledge."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study. All evaluation is automated using benchmark datasets."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants. The Ethical Considerations section confirms: 'This work does not involve human subjects, personally identifiable information, or any sensitive user data.'"
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No inference cost, latency, or per-example timing is reported. Section 3.5 discusses parameter efficiency (0.21% additional parameters) but does not report wall-clock inference time or throughput."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Hardware is specified (6 NVIDIA RTX 5880 GPUs, 48GB each) and training runs for 1 epoch, but total GPU hours, training time, or API costs for GPT-4o data generation are not quantified."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No mention of multiple random seeds. All results appear to be from single experimental runs with no seed sensitivity analysis."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The number of experimental runs is never stated. Results are presented without indicating how many runs produced them."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No hyperparameter search is described. The ablation study tests architectural variants but does not report any systematic hyperparameter search budget for the chosen configuration."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The default configuration (linear shift + sum fusion + curated DPO data) is presented as the final method. While ablations show this combination performs well, the selection process for hyperparameters like LoRA rank, learning rate, and batch size is not described."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The paper makes numerous comparisons across 5 baselines, 7+ attack types, 2 models, and 3+ benchmarks without any statistical testing, let alone correction for multiple comparisons."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors re-implement all four baselines (StruQ, SecAlign, ISE, PFT) but do not acknowledge the potential bias of evaluating their own system against their own re-implementations of competing methods."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "Section 3.5 notes that DRIP adds only 0.21% parameters, but no comparison of training or inference compute across methods is provided. It is unclear whether DRIP and baselines use comparable compute budgets."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper does not discuss whether SEP's witness-string-based evaluation, AlpacaFarm's exact-match 'hacked' criterion, or InjecAgent's API-call detection actually capture real-world prompt injection risk. No discussion of construct validity for any benchmark."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is involved. DRIP modifies model architecture and training; comparisons are at the model level, not scaffold level."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the base models (LLaMA-8B, Mistral-7B) were pre-trained on data that includes SEP, AlpacaFarm, or InjecAgent benchmark content, which could affect the baseline performance levels."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the evaluation setup leaks information. For example, the SEP evaluation uses witness strings that the model may recognize from training data patterns."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "The SEP training and evaluation sets are from the same benchmark suite (both use SQuAD-based tasks). No analysis of structural similarity or potential overlap between training and evaluation data."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention method (canary strings, membership inference, decontamination) is applied."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "DRIP improves role separation score by 12–49% over existing defenses on the SEP benchmark.",
    370       "evidence": "Table 3: DRIP achieves 80.9% SEP on LLaMA-8B (vs SecAlign 31.9%, a 49pp improvement) and 70.7% on Mistral-7B (vs SecAlign 58.6%, a 12.1pp improvement).",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "DRIP reduces attack success rate by over 66% for GCG adaptive attacks compared to existing defenses.",
    375       "evidence": "Table 5: Under GCG attack, DRIP achieves 1.06% ASR on LLaMA-8B vs SecAlign's 66.67%, and 3.37% on Mistral-7B vs SecAlign's 98.56%.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "DRIP achieves 0% ASR on all heuristic-based prompt injection attacks across both models.",
    380       "evidence": "Table 5: All heuristic attack families (Naive, Ignore, Completion, Escape, HackaPrompt) show 0.00% ASR for DRIP on both LLaMA-8B and Mistral-7B.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "DRIP maintains instruction-following utility comparable to the undefended model.",
    385       "evidence": "Table 6: AlpacaEval 2.0 win% is 83.89% (DRIP) vs 85.37% (Undefended) on LLaMA-8B. IFEval accuracy is actually higher for DRIP: 76.02% vs 72.66%. MT-Bench (Figure 8) shows DRIP closely tracks the undefended model.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "The de-instruction shift selectively applies stronger representation edits to instruction-like tokens while leaving neutral tokens largely unchanged.",
    390       "evidence": "Figure 7 visualizes token-wise ℓ2 shift magnitudes showing elevated shifts at boundary tokens and adversarial phrases ('ignore', 'disregard'). Figure 6 shows a t-SNE visualization of linearly separable manifolds after editing.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Sum fusion provably halves the worst-case logit sensitivity to suffix perturbations compared to an undefended decoder.",
    395       "evidence": "Theorem 2 (Appendix B) provides a formal proof that sum fusion's Lipschitz constant is half the undefended decoder's. However, empirical validation of the bound's tightness is not provided.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "Concat fusion significantly harms utility compared to sum fusion.",
    400       "evidence": "Table 7 ablation: concat fusion achieves 70.14% utility vs sum fusion's 83.89% on AlpacaEval 2.0, a 13.75pp degradation. Theorem 4 (Appendix C) provides information-theoretic justification.",
    401       "supported": "strong"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "No error bars or variance across runs",
    407       "detail": "All experimental results are reported as single-point estimates with no standard deviation, confidence intervals, or indication of multiple runs. Given that DPO training and LLM fine-tuning are sensitive to random seeds, this omission makes it impossible to assess result stability."
    408     },
    409     {
    410       "flag": "No statistical significance testing",
    411       "detail": "Claims of improvement (e.g., '12–49% improvement') are based entirely on comparing point estimates. With no significance tests, it is unclear whether observed differences are statistically meaningful or within natural variation."
    412     },
    413     {
    414       "flag": "Self-comparison bias in baseline re-implementation",
    415       "detail": "The authors re-implement all four baselines (StruQ, SecAlign, ISE, PFT) themselves. Lucic et al. (2018) showed that authors' re-implementations of baselines systematically underperform the original, yet this bias is not acknowledged or mitigated."
    416     },
    417     {
    418       "flag": "Small evaluation subset for AlpacaFarm",
    419       "detail": "The AlpacaFarm evaluation uses only 208 examples after filtering from 805 original prompts (Section 4.1.1). This small size combined with the lack of error bars raises questions about the stability of the reported ASR numbers."
    420     },
    421     {
    422       "flag": "Training data generated by model vulnerable to the studied attack",
    423       "detail": "Ground-truth DPO training responses are generated by GPT-4o, which the paper acknowledges is itself vulnerable to prompt injection (Section 3.2). While mitigated by XML-tagging and LLM auditing, this introduces a bootstrapping concern about training data quality."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "StruQ: Defending against prompt injection with structured queries",
    429       "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"],
    430       "year": 2024,
    431       "arxiv_id": "2402.06363",
    432       "relevance": "Key baseline defense against prompt injection using adversarial training with structured delimiters and SFT objective."
    433     },
    434     {
    435       "title": "SecAlign: Defending against prompt injection with preference optimization",
    436       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "David Wagner", "Chuan Guo"],
    437       "year": 2024,
    438       "arxiv_id": "2410.05451",
    439       "relevance": "Strongest prior baseline using DPO-based contrastive training for prompt injection defense."
    440     },
    441     {
    442       "title": "Instructional segment embedding: Improving LLM safety with instruction hierarchy",
    443       "authors": ["Tong Wu", "Shujian Zhang", "Kaiqiang Song"],
    444       "year": 2024,
    445       "arxiv_id": "2410.09102",
    446       "relevance": "Architectural defense using segment-type embeddings to distinguish instruction and data spans."
    447     },
    448     {
    449       "title": "Universal and transferable adversarial attacks on aligned language models",
    450       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J. Zico Kolter", "Matt Fredrikson"],
    451       "year": 2023,
    452       "arxiv_id": "2307.15043",
    453       "relevance": "GCG attack method used as the primary optimization-based adaptive attack in evaluation."
    454     },
    455     {
    456       "title": "Neural exec: Learning (and learning from) execution triggers for prompt injection attacks",
    457       "authors": ["Dario Pasquini", "Martin Strohmeier", "Carmela Troncoso"],
    458       "year": 2024,
    459       "arxiv_id": "2403.03792",
    460       "relevance": "NeuralExec attack method for learning universal adversarial execution triggers for prompt injection."
    461     },
    462     {
    463       "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents",
    464       "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"],
    465       "year": 2024,
    466       "arxiv_id": "2403.02691",
    467       "relevance": "Agentic prompt injection benchmark with tool-based interactions used for evaluation."
    468     },
    469     {
    470       "title": "Can LLMs separate instructions from data? And what do we even mean by that?",
    471       "authors": ["Egor Zverev", "Sahar Abdelnabi", "Soroush Tabesh", "Mario Fritz", "Christoph H. Lampert"],
    472       "year": 2025,
    473       "arxiv_id": "2403.06833",
    474       "relevance": "SEP benchmark defining the instruction-data separation evaluation framework used as the primary benchmark."
    475     },
    476     {
    477       "title": "ASIDE: Architectural separation of instructions and data in language models",
    478       "authors": ["Egor Zverev", "Evgenii Kortukov", "Alexander Panfilov"],
    479       "year": 2025,
    480       "arxiv_id": "2503.10566",
    481       "relevance": "Related architectural approach imposing orthogonality between latent representations of instruction and data."
    482     },
    483     {
    484       "title": "Meta SecAlign: A secure foundation LLM against prompt injection attacks",
    485       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "David Wagner", "Chuan Guo"],
    486       "year": 2025,
    487       "arxiv_id": "2507.02735",
    488       "relevance": "Extension of SecAlign to foundation model level for prompt injection defense."
    489     },
    490     {
    491       "title": "Melon: Provable defense against indirect prompt injection attacks in AI agents",
    492       "authors": ["Kaijie Zhu", "Xianjun Yang", "Jindong Wang", "Wenbo Guo", "William Yang Wang"],
    493       "year": 2025,
    494       "arxiv_id": "2502.05174",
    495       "relevance": "Provable defense against indirect prompt injection in agentic settings."
    496     },
    497     {
    498       "title": "Fath: Authentication-based test-time defense against indirect prompt injection attacks",
    499       "authors": ["Jiongxiao Wang", "Fangzhou Wu", "Wendi Li"],
    500       "year": 2024,
    501       "arxiv_id": "2410.21492",
    502       "relevance": "Inference-time defense using authentication and hashing for retrieved content integrity."
    503     },
    504     {
    505       "title": "Are you still on track!? Catching LLM task drift with activations",
    506       "authors": ["Sahar Abdelnabi", "Aideen Fay", "Giovanni Cherubin", "Ahmed Salem", "Mario Fritz", "Andrew Paverd"],
    507       "year": 2024,
    508       "arxiv_id": "2406.00799",
    509       "relevance": "Detection-based defense monitoring activation shifts to detect prompt injection via task drift."
    510     },
    511     {
    512       "title": "Defending against indirect prompt injection attacks with spotlighting",
    513       "authors": ["Keegan Hines", "Gary Lopez", "Matthew Hall", "Federico Zarfati", "Yonatan Zunger", "Emre Kiciman"],
    514       "year": 2024,
    515       "arxiv_id": "2403.14720",
    516       "relevance": "Inference-time defense using trusted-region encoding to mitigate indirect injection."
    517     },
    518     {
    519       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    520       "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
    521       "year": 2024,
    522       "relevance": "Formalization of prompt injection attack taxonomy and benchmark used to define attack categories in the evaluation."
    523     },
    524     {
    525       "title": "The illusion of role separation: Hidden shortcuts in LLM role learning (and how to fix them)",
    526       "authors": ["Zihao Wang", "Yibo Jiang", "Jiahao Yu", "Heqing Huang"],
    527       "year": 2025,
    528       "arxiv_id": "2505.00626",
    529       "relevance": "Analysis of shortcut learning in LLM role separation defenses, directly relevant to PFT baseline."
    530     }
    531   ],
    532   "engagement_factors": {
    533     "practical_relevance": {
    534       "score": 2,
    535       "justification": "DRIP provides a concrete, implementable defense with code released, but requires fine-tuning open-source LLMs — not immediately deployable without ML engineering effort."
    536     },
    537     "surprise_contrarian": {
    538       "score": 1,
    539       "justification": "The representation editing framing is novel but the overall direction of training-time prompt injection defense is well-established; results extend rather than challenge the field."
    540     },
    541     "fear_safety": {
    542       "score": 2,
    543       "justification": "Demonstrates that existing defenses (including SecAlign) have 66–98% ASR under adaptive GCG attacks, raising concerns about deployed prompt injection defenses."
    544     },
    545     "drama_conflict": {
    546       "score": 0,
    547       "justification": "No controversy or conflict framing; straightforward technical defense paper."
    548     },
    549     "demo_ability": {
    550       "score": 1,
    551       "justification": "An anonymous code repository exists and an anonymous demo website is referenced, but actually trying the method requires GPU resources and fine-tuning."
    552     },
    553     "brand_recognition": {
    554       "score": 0,
    555       "justification": "From NUS and SJTU — respected academic institutions but not headline AI labs."
    556     }
    557   }
    558 }

Impressum · Datenschutz