ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27943B)


      1 {
      2   "paper": {
      3     "title": "ChatInject: Abusing Chat Templates for Prompt Injection in LLM Agents",
      4     "authors": ["Hwan Chang", "Yonghyun Jun", "Hwanhee Lee"],
      5     "year": 2025,
      6     "venue": "ICLR 2026",
      7     "arxiv_id": "2509.22830"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The abstract states 'The code is available at https://github.com/hwanchang00/ChatInject' — a working GitHub URL is provided."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses two publicly available benchmarks: AgentDojo (Debenedetti et al., 2024) and InjecAgent (Zhan et al., 2024). These are standard public benchmarks. The generated multi-turn dialogues would be in the released code repository."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is mentioned. Appendix D states models were accessed via OpenRouter API with temperature 0 and lists providers, but no local environment dependencies are specified."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The Reproducibility Statement references Sections 3.2, 3.3, and Appendix D for experimental details, but no step-by-step reproduction instructions (README with commands, scripts to run) are described in the paper itself."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "95% Wilson confidence intervals are reported in Appendix E (Tables 10 and 11) for all main results, and error bars are shown in Figures 3, 4, 5, and 6."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims ChatInject 'achieves significantly higher average attack success rates' but does not report any formal significance tests (p-values, t-tests, etc.). Comparisons are based on point estimates and confidence intervals, but no statistical hypothesis testing is performed."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper consistently reports absolute differences with baselines. For example, 'improving from 5.18% to 32.05% on AgentDojo and from 15.13% to 45.90% on InjecAgent.' Tables include colored deltas showing changes relative to baseline (e.g., '+30.9', '-22.6')."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is given for the benchmark sizes (InjecAgent has 1054 samples, AgentDojo has 389). There is no power analysis or discussion of whether these sizes are sufficient for the claims made."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "For experiments involving random seeds (MoT in Section 5.3, perturbation experiments in Section 6.2), mean ± std are reported across five random seeds. Figure 4 shows 'mean ± std' and Figure 6 shows 'mean ± std across five seeds'."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares against Default InjecPrompt (standard plain-text injection) and Default Multi-turn as baselines, as well as benign utility (no attack) in Table 4."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The baselines use state-of-the-art injection methods from Debenedetti et al. (2024) and recent defense mechanisms including Lakera Guard and ProtectAI detectors. The benchmarks (AgentDojo, InjecAgent) are contemporary."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper systematically ablates components: comparing plain-text vs. template formatting, single-turn vs. multi-turn, reasoning hooks vs. tool-calling hooks, different persuasion techniques (Section C.1), number of dialogue turns (Figure 7a), and template perturbation types (Section 6.2)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Two key metrics are used: Attack Success Rate (ASR) measuring attack effectiveness, and Utility under Attack measuring the agent's ability to complete legitimate tasks. Both are reported throughout."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of attack outputs is performed. All evaluation is automated via benchmark scoring (InjecAgent and AgentDojo procedures). The paper makes subjective quality claims about generated multi-turn dialogues but only describes manual review of dialogue generation quality, not of attack outcomes."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The evaluation uses two established benchmarks (AgentDojo and InjecAgent) which serve as standardized test sets. The attack payloads are constructed independently from the evaluation sets."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down per model (9 models), per benchmark (AgentDojo and InjecAgent), per attack variant (4 variants + reasoning/tool hooks), per template (7 templates in transfer experiments), and per persuasion technique (Figure 7b)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses models where attacks are less effective: Grok-2 shows 'only minor ASR gains under ChatInject' due to weak role delimiters (Section 4.1), GPT-oss shows low transferability, and Llama-4 has low benign utility. Section C.4 analyzes why homoglyph encoding fails."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Several negative results are reported: foreign templates on InjecAgent often fall below Default InjecPrompt ASR (Table 2), homoglyph-encoded templates yield very low ASR (Table 6), and some defense mechanisms do reduce ASR for certain variants (Figure 5)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The three abstract claims — (1) higher ASR (5.18%→32.05% on AgentDojo, 15.13%→45.90% on InjecAgent), (2) cross-model transferability including closed-source models, (3) existing defenses are largely ineffective — are all supported by Tables 1, 2, and Figure 5 respectively."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes causal claims about chat templates granting authority to payloads. These are supported by controlled experiments: the only variable changed between Default InjecPrompt and ChatInject is the template formatting, with all other factors held constant. The attention analysis in Section C.3 (Table 5) provides mechanistic evidence. Ablation studies isolate individual components."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper bounds its claims to the tested benchmarks and models. Limitations in Appendix B acknowledge synthetic dialogue limitations, limited internal analysis due to resource constraints, and defense limitations. The title specifically says 'Chat Templates' rather than making broader claims."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper discusses alternative explanations: Grok-2's robustness may be due to weak role delimiters (Section 4.1), GPT-oss Utility collapse is attributed to tool-call detours (Section 4.1), and the attention analysis in Section C.3 provides mechanistic explanation. The homoglyph experiment (C.4) tests whether tokenization rather than template structure explains the effect."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper lists model names like 'Qwen3-235B-A22B', 'GPT-oss-120b', 'Llama-4-Maverick', 'GLM-4.5', 'Kimi-K2', 'Grok-2', 'GPT-4o', 'Grok-3', 'Gemini-2.5-Pro', but does not provide specific API snapshot dates or version identifiers. 'GPT-4o' is listed without a version date. These are marketing names without snapshot dates."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full prompt text is provided: the multi-turn dialogue generation prompt in Table 12, the attention-grabbing prefix in Section D.1, defense prompts in Tables 15 and 16, and payload examples in Tables 13-14, 17-18. The actual chat template tokens used are listed in Tables 19-20."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Appendix D states: 'All models were accessed via the OpenRouter (2025) API with temperature set to 0 (greedy decoding).' The perturbation ratio is specified as 0.1 (Section D.5), and 7 turns are used for multi-turn dialogues (Section 3.2). Five random seeds are used for stability experiments."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The agentic scaffolding is described in detail: Section 3.1 formalizes the agent pipeline with tool calls, Section 4.2 describes reasoning and tool-calling hooks with specific token formats, and the benchmarks (AgentDojo, InjecAgent) define the agent framework used."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 3.2 describes how multi-turn dialogues are generated using GPT-4.1 with specific prompts (Table 12) and manually reviewed. Section D.2 describes the review criteria. The benchmarks used (AgentDojo and InjecAgent) are standard and unmodified."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Appendix B is titled 'Limitations and Future Work' and provides substantive discussion across three categories: Synthetic Multi-turn Generation, Limited Internal Analysis, and Defense Limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The limitations section discusses specific threats: synthetic dialogues 'may not capture real-world persuasive conversation diversity', 'resource constraints prevented detailed attention analysis', and existing defenses 'incur significant trade-offs: longer prompts, additional runtime processing, and high false positive rates.'"
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "While limitations are discussed, the paper does not explicitly state what the results do NOT show. There is no explicit statement bounding claims to specific settings (e.g., 'our results do not demonstrate effectiveness against models with special token filtering' or 'we did not test on real-world deployed systems'). The limitations are framed as future work rather than scope boundaries."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Raw experimental logs (individual model outputs, per-example attack results) are not made available. Only aggregate statistics (ASR percentages, Utility percentages) are reported. The generated multi-turn dialogues are presumably in the GitHub repo but this is not explicitly stated."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3.2 describes how multi-turn dialogues were generated using GPT-4.1 with specific prompts. Section 3.3 describes benchmark selection. Appendix D provides API access details, providers, and configuration."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved. The study evaluates automated attacks against LLM agents using standard benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline is documented: (1) generate multi-turn dialogues with GPT-4.1 (Section 3.2), (2) manually review for quality (Section D.2), (3) format payloads with templates (Section 3.2), (4) inject into benchmark scenarios, (5) evaluate with benchmark-specific metrics (Section 3.3). Template construction details are in Appendix D.1."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The Acknowledgement section discloses two funding sources: IITP grant funded by Korean government (MSIT) for AI Graduate School Program at Chung-Ang University, and NRF grant funded by Korean government (MSIT)."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors are affiliated with Department of Artificial Intelligence, Chung-Ang University. The paper evaluates models from various companies (OpenAI, Google, xAI, etc.) without any author affiliation to those companies."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funding comes from Korean government research agencies (IITP and NRF), which have no financial stake in whether the attack method succeeds or fails against any particular model."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper. Absence of disclosure is not the same as absence of conflict."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This paper tests attack methods against LLM agents, not the models' knowledge on benchmarks. The benchmarks (AgentDojo, InjecAgent) test agent behavior in response to injected payloads, not memorized knowledge. Contamination of attack scenarios is not a relevant concern."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "The study evaluates attack effectiveness against LLM agents, not model knowledge on benchmarks. Whether models have seen the benchmark data does not affect the validity of attack success rate measurements, since the attack exploits structural template vulnerabilities rather than knowledge gaps."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Same reasoning as above: the study tests security vulnerabilities (prompt injection via chat templates), not model capability on knowledge benchmarks. Contamination is structurally irrelevant."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this study. All experiments are automated evaluations of attack methods against LLM agents."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants. The Ethics Statement discusses responsible research design and defensive intent but does not mention IRB approval because none is needed."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No API costs, token counts, or wall-clock time are reported for running the experiments. The paper accesses 9 frontier models via API but does not quantify the cost."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget, API spend, or hardware details are provided. The paper mentions 'resource constraints' in the limitations but does not quantify the resources used."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "ChatInject achieves significantly higher attack success rates than traditional prompt injection, improving from 5.18% to 32.05% on AgentDojo and from 15.13% to 45.90% on InjecAgent.",
    286       "evidence": "Table 1 shows ASR results across 6 open-source models on both benchmarks. Multi-turn + ChatInject achieves average 52.33% ASR on InjecAgent. Results with 95% confidence intervals in Appendix E (Tables 10-11).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Chat-template-based payloads demonstrate strong transferability across models, remaining effective against closed-source LLMs with unknown template structures.",
    291       "evidence": "Table 2 shows cross-model transfer results. OS templates injected into CS models (GPT-4o, Grok-3, Gemini-2.5-Pro) generally raise ASR above Default InjecPrompt. Qwen-3 template averages 29.6% ASR on CS models on InjecAgent vs. 4.4% default.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Template similarity predicts attack transferability: higher similarity between injected and target templates leads to higher ASR.",
    296       "evidence": "Figure 3 shows a clear correlation between template embedding similarity and ASR on AgentDojo. Section 5.1 describes the embedding similarity methodology using lighter-weight models from the same families.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Existing prompt-based defenses are largely ineffective against ChatInject, especially Multi-turn variants.",
    301       "evidence": "Figure 5 shows defense evaluation on Qwen-3 and Grok-3. Prompt-based defenses (instructional prevention, delimiting, repeat user) show higher ASR for ChatInject and Multi-turn variants compared to no-defense baselines. External detectors reduce ASR but degrade Utility significantly.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Mixture-of-Templates (MoT) is an effective attack strategy when the target model's backbone is unknown, yielding higher and more stable ASR than single-template attacks.",
    306       "evidence": "Figure 4 and Section 5.3 show MoT consistently exceeds Default InjecPrompt ASR across Qwen-3, GPT-oss, and Llama-4, with lower variance than single-template attacks across 5 random seeds.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "ChatInject variants remain effective even under template perturbations designed to defeat rule-based parsing.",
    311       "evidence": "Figure 6 shows that all perturbed variants (remove, replace, insert at 10% character rate) continue to outperform Default InjecPrompt and Default Multi-turn baselines across 3 models on AgentDojo.",
    312       "supported": "moderate"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval"],
    316   "key_findings": "ChatInject exploits LLM chat templates to perform indirect prompt injection by forging role tags within tool outputs, causing models to misinterpret malicious payloads as higher-priority instructions. Across 9 frontier LLMs on two benchmarks, ChatInject raises average attack success rates from baselines of 5-15% to 32-52%. The attack transfers across models, including closed-source ones with unknown templates, and a mixture-of-templates strategy provides robust performance when the target model is unknown. Existing prompt-based defenses are largely ineffective, and template perturbations defeat rule-based parsing countermeasures while preserving attack efficacy.",
    317   "red_flags": [
    318     {
    319       "flag": "No statistical significance testing",
    320       "detail": "The paper uses the word 'significantly' when describing ASR improvements but relies only on point estimates and confidence intervals without formal hypothesis tests. Some confidence intervals (especially on AgentDojo with N=389) are wide enough that apparent differences between conditions may not be statistically significant."
    321     },
    322     {
    323       "flag": "Defense evaluation limited to two models",
    324       "detail": "The defense evaluation in Section 6.1 is conducted only on Qwen-3 and Grok-3, despite the paper testing 9 models in other sections. This limits the generalizability of the defense ineffectiveness claim."
    325     },
    326     {
    327       "flag": "No cost reporting despite extensive API usage",
    328       "detail": "The paper accesses 9 frontier LLMs via API across thousands of benchmark instances with multiple attack variants, but reports no API costs, token counts, or computational budget, making it impossible to assess practical reproducibility."
    329     },
    330     {
    331       "flag": "Template similarity measured on proxy models",
    332       "detail": "Due to resource constraints, template similarity (Section 5.1) is measured using lighter-weight models (Qwen3-30B, GPT-oss-20B, Llama-4-Scout-17B) rather than the actual models used in experiments. The paper acknowledges this but the correlation between proxy similarity and actual model behavior may not hold."
    333     }
    334   ],
    335   "cited_papers": [
    336     {
    337       "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents",
    338       "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunovic", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"],
    339       "year": 2024,
    340       "relevance": "Key benchmark used to evaluate indirect prompt injection attacks and defenses for LLM agents."
    341     },
    342     {
    343       "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents",
    344       "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"],
    345       "year": 2024,
    346       "doi": "10.18653/v1/2024.findings-acl.624",
    347       "relevance": "Key benchmark for assessing LLM agent robustness against indirect prompt injection attacks."
    348     },
    349     {
    350       "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions",
    351       "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"],
    352       "year": 2024,
    353       "arxiv_id": "2404.13208",
    354       "relevance": "Foundational work on instruction hierarchies that ChatInject exploits by forging role tags."
    355     },
    356     {
    357       "title": "Adaptive Attacks Break Defenses Against Indirect Prompt Injection Attacks on LLM Agents",
    358       "authors": ["Qiusi Zhan", "Richard Fang", "Henil Shalin Panchal", "Daniel Kang"],
    359       "year": 2025,
    360       "doi": "10.18653/v1/2025.findings-naacl.395",
    361       "relevance": "Demonstrates that adaptive attacks can bypass defenses for indirect prompt injection in LLM agents."
    362     },
    363     {
    364       "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    365       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    366       "year": 2023,
    367       "relevance": "Foundational work defining indirect prompt injection in LLM-integrated applications."
    368     },
    369     {
    370       "title": "SecAlign: Defending Against Prompt Injection with Preference Optimization",
    371       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "David Wagner", "Chuan Guo"],
    372       "year": 2025,
    373       "relevance": "Defense approach using preference optimization to protect against prompt injection attacks."
    374     },
    375     {
    376       "title": "Can LLMs Separate Instructions from Data? And What Do We Even Mean by That?",
    377       "authors": ["Egor Zverev", "Sahar Abdelnabi", "Soroush Tabesh", "Mario Fritz", "Christoph H. Lampert"],
    378       "year": 2025,
    379       "relevance": "Examines the fundamental challenge of LLMs distinguishing between data and instructions, core to prompt injection."
    380     },
    381     {
    382       "title": "Agent Security Bench (ASB): Formalizing and Benchmarking Attacks and Defenses in LLM-Based Agents",
    383       "authors": ["Hanrong Zhang", "Jingyuan Huang", "Kai Mei", "Yifei Yao", "Zhenting Wang", "Chenlu Zhan", "Hongwei Wang", "Yongfeng Zhang"],
    384       "year": 2025,
    385       "relevance": "Comprehensive benchmark for evaluating security of LLM-based agents including attack and defense formalization."
    386     },
    387     {
    388       "title": "AutoHijacker: Automatic Indirect Prompt Injection Against Black-Box LLM Agents",
    389       "authors": ["Xiaogeng Liu", "Somesh Jha", "Patrick McDaniel", "Bo Li", "Chaowei Xiao"],
    390       "year": 2025,
    391       "relevance": "Automated approach to generating indirect prompt injection attacks against black-box LLM agents."
    392     },
    393     {
    394       "title": "Foot-in-the-Door: A Multi-Turn Jailbreak for LLMs",
    395       "authors": ["Zixuan Weng", "Xiaolong Jin", "Jinyuan Jia", "Xiangyu Zhang"],
    396       "year": 2025,
    397       "arxiv_id": "2502.19820",
    398       "relevance": "Multi-turn jailbreak technique that ChatInject adapts for indirect prompt injection via chat template exploitation."
    399     },
    400     {
    401       "title": "X-Teaming: Multi-Turn Jailbreaks and Defenses with Adaptive Multi-Agents",
    402       "authors": ["Salman Rahman", "Liwei Jiang", "James Shiffer", "Genglin Liu", "Sheriff Issaka", "Md Rizwan Parvez", "Hamid Palangi", "Kai-Wei Chang", "Yejin Choi", "Saadia Gabriel"],
    403       "year": 2025,
    404       "relevance": "Multi-turn jailbreak framework using adaptive multi-agents, related to multi-turn attack strategies."
    405     },
    406     {
    407       "title": "ChatBug: A Common Vulnerability of Aligned LLMs Induced by Chat Templates",
    408       "authors": ["Fengqing Jiang", "Zhangchen Xu", "Luyao Niu", "Bill Yuchen Lin", "Radha Poovendran"],
    409       "year": 2024,
    410       "relevance": "Prior work on chat template vulnerabilities for jailbreaking LLMs; ChatInject extends this to indirect prompt injection."
    411     },
    412     {
    413       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    414       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik R Narasimhan", "Yuan Cao"],
    415       "year": 2023,
    416       "relevance": "Foundational framework for LLM agents combining reasoning and tool use, which ChatInject attacks exploit."
    417     },
    418     {
    419       "title": "Defending Against Indirect Prompt Injection Attacks with Spotlighting",
    420       "authors": ["Keegan Hines", "Gary Lopez", "Matthew Hall", "Federico Zarfati", "Yonatan Zunger", "Emre Kiciman"],
    421       "year": 2024,
    422       "arxiv_id": "2403.14720",
    423       "relevance": "Defense method using data delimiters to isolate external content from prompt injection, evaluated as a baseline defense."
    424     }
    425   ]
    426 }

Impressum · Datenschutz