scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32083B)
      1 {
      2   "paper": {
      3     "title": "TopicAttack: An Indirect Prompt Injection Attack via Topic Transition",
      4     "authors": ["Yulin Chen", "Haoran Li", "Yuexin Li", "Yue Liu", "Yangqiu Song", "Bryan Hooi"],
      5     "year": 2025,
      6     "venue": "Conference on Empirical Methods in Natural Language Processing",
      7     "arxiv_id": "2507.13686",
      8     "doi": "10.48550/arXiv.2507.13686"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "TopicAttack achieves >90% attack success rate in most configurations across 10 LLMs (small, large, and closed-source) and 3 datasets, significantly outperforming prior prompt injection methods. The method fabricates multi-turn conversational transitions that gradually shift the topic from benign content toward the injected instruction, reducing injection abruptness. Even under strong fine-tuning-based defenses (StruQ, SecAlign), TopicAttack maintains high ASR (e.g., 92% on SecAlign for Qwen2-7B) where all other attacks drop below 10%. Attention analysis shows TopicAttack shifts model attention from original to injected instructions, correlating with attack success.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper states 'Code is publicly available at https://github.com/LukeChen-go/topicattack' in footnote 1."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available datasets: Inj-SQuAD and Inj-TriviaQA from Chen et al. (2025b), and InjectAgent from Zhan et al. (2024) released under the MIT License. The underlying QA datasets (SQuAD, TriviaQA) are public benchmarks."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Appendix A specifies: 'PyTorch 2.1.0, single NVIDIA H100 GPU, do_sample=false, max_new_tokens=256, max_length=8192.' Key generation parameters and framework version are provided."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. While code is released and implementation details are in Appendix A, there is no 'Reproducing Results' section or explicit workflow to replicate the experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results in Tables 1-11 are reported as point estimates (e.g., '87.89%' ASR) with no confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims TopicAttack 'significantly outperforms' baselines throughout, but no statistical significance tests (t-test, bootstrap, etc.) are reported. All comparisons are based on raw ASR value differences."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Tables report absolute ASR values for all methods under all conditions, providing clear context for the magnitude of differences. The text also contextualizes specific comparisons, e.g., 'TopicAttack achieves ASR of 90.67%... while other attacks are below 10%' (Section 5.3)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The datasets contain 900 samples each (Inj-SQuAD, Inj-TriviaQA) and 510 samples (InjectAgent), but no justification for these sizes or power analysis is provided."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be single-run evaluations with no indication of stability across runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Five prompt-engineering attack baselines (Naive, Ignore, Escape, Fakecom, Combined) and two gradient-based attacks (AutoDAN, GCG) are compared. Five defense methods (None, Sandwich, Spotlight, StruQ, SecAlign) are also evaluated."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include recent methods: Fakecom (Willison 2023), Combined (Liu et al. 2024b), GCG (Zou et al. 2023), AutoDAN (Zhu et al. 2023), and defenses StruQ (Chen et al. 2024a) and SecAlign (Chen et al. 2025a). These represent the current state of the art."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple ablation studies are conducted: reminding prompt removal (Table 9), identifier influence (Table 10), injection position (Table 11), and multi-turn dialogue scenarios (Table 7). Section 5.5 is dedicated to ablation experiments."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Only ASR (attack success rate) is used as the evaluation metric. While perplexity and attention ratios are analyzed for explanatory purposes (Section 5.6, Figures 2-3), these are not evaluation metrics."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "All evaluation is automated. Chatbot attacks are evaluated by checking if target content appears in the response; agent attacks by checking if the target tool was invoked. No human evaluation of attack quality or naturalness is performed."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No explicit dev/test split is described. The method's hyperparameters (m=5 transition turns, prompt templates) appear to be set without a separate validation set. It is unclear whether any design decisions were informed by the evaluation data."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by model size (small/large/closed-source), dataset (Inj-SQuAD/Inj-TriviaQA/InjectAgent), defense method (5 defenses), and scenario (chatbot/agent/multi-turn). Tables 1-7 provide comprehensive breakdowns."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "The paper does not analyze cases where TopicAttack fails. For example, on Llama3-8B with SecAlign, ASR drops to 0.44% (Table 1), but no analysis of why the attack fails in these cases is provided."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table 10 shows that changing identifiers on the Fakecom attack to TopicAttack's format actually decreases ASR in several configurations (e.g., Llama3-8B Inj-SQuAD drops from 84.67% to 55.44%). The paper notes 'changing the identifiers alone does not consistently improve ASR. In some settings, performance improves, while in others it decreases significantly.'"
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims 'ASR over 90% in most cases, even when various defense methods are applied.' The tables show TopicAttack achieves >90% in the majority of configurations across chatbot and agent scenarios, supporting this claim. The 'state-of-the-art' claim is supported by TopicAttack achieving the highest ASR in nearly all tested configurations."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims about component contributions are supported by controlled ablation studies: reminding prompt removal (Table 9) and identifier changes (Table 10) use single-variable manipulation. The attention analysis (Section 5.6) provides a mechanistic explanation for why the method works."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title and claims frame the method broadly as 'An Indirect Prompt Injection Attack' without explicitly bounding generalization. While the evaluation covers 10 models and 3 datasets, the paper does not state what settings or models the results might NOT generalize to."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper attributes TopicAttack's success to smooth topic transitions and attention manipulation (Section 5.6) but does not consider alternative explanations, such as whether the longer context itself (more tokens from fabricated conversation) contributes to the effect, or whether the multi-turn format alone is sufficient."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "ASR directly measures attack success — whether the target content appears in the response (chatbot) or the target tool is invoked (agent). The paper's claims match the granularity of its measurements without broader framing beyond what was measured."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Open-source models are identified by specific release names (e.g., 'Llama3-8B-Instruct', 'Qwen2-7B-Instruct'), but closed-source models are given only as marketing names ('GPT-4o-mini', 'GPT-4o', 'GPT-4.1') without API version or snapshot dates. The schema requires specific versions for API-based models."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The transition construction prompts for both chatbot and agent scenarios are provided in full in Appendix C. The reminding prompt is given verbatim in Section 4.2. Case study examples showing complete constructed injections appear in Appendix D."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix A reports: 'do_sample=false, max_new_tokens=256, max_length=8192.' The number of transition turns is fixed at m=5 (Section 4.2). Key generation parameters are documented."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "TopicAttack is a prompt construction method, not an agentic system. No scaffolding, tool use, or workflow orchestration is part of the attack methodology itself."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 5.1 describes the datasets: Inj-SQuAD and Inj-TriviaQA are derived from SQuAD and TriviaQA with injected instructions for phishing, advertisement, and propaganda (900 samples each). InjectAgent uses the 'Direct Harm' scenario with 510 samples. The transition prompt construction process is described in Section 4.2 and Appendix C."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "A dedicated 'Limitations' section appears after the Conclusion, discussing three specific limitations of the work."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The Limitations section mentions study-specific concerns: inability to fine-tune models exceeding 70B parameters (limiting defense evaluation to prompt-based methods for large models), the need to design specific prompts for transition generation, and the inability to provide formal mathematical proof of the method's properties."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what the results do NOT show or what settings are excluded. There is no discussion of models, application types, or attack scenarios where TopicAttack might not work."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "Only aggregate ASR percentages are reported in tables. No per-example attack results, model outputs, or raw data files are mentioned as available for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 5.1 describes the data sources: Inj-SQuAD and Inj-TriviaQA from Chen et al. (2025b) derived from two QA datasets with injected instructions, and InjectAgent from Zhan et al. (2024) under the 'Direct Harm' scenario. Sample sizes are stated (900 and 510)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants are involved. All data comes from standard public benchmarks (SQuAD, TriviaQA, InjectAgent)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The attack construction pipeline is documented in Section 4.2: benign content Tb is concatenated with a fabricated transition Tt (generated by GPT-4o with m=5 turns) and the injected instruction Iinj with a reminding prompt. The formula Tinj = Tb ⊕ Tt ⊕ Iinj is explicit."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The Acknowledgment section states: 'Dr. Haoran Li, JC STEM Early Career Research Fellow, supported by The Hong Kong Jockey Club Charities Trust.'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: National University of Singapore and HKUST. The authors evaluate third-party models (Meta's Llama, Alibaba's Qwen, OpenAI's GPT), not their own institution's products."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "The Hong Kong Jockey Club Charities Trust is a charitable organization with no financial interest in prompt injection attack research outcomes."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement or financial interests declaration is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This paper evaluates the effectiveness of prompt injection attacks, not model knowledge or capability on benchmarks. The models are tested for vulnerability to attacks, not for correct answers to questions."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "The paper tests attack success (whether models follow injected instructions), not model knowledge, so train/test overlap in the traditional contamination sense does not apply."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "The paper evaluates attack transferability across models and defenses, not model performance on knowledge benchmarks. Contamination is structurally inapplicable."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in any experiments."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved, so no IRB approval is needed. The Ethical Consideration section discusses adherence to the ACM Code of Ethics and the ACL Code of Conduct."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are involved."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants are involved."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost is reported. The method requires GPT-4o API calls to generate transition prompts for each attack instance, but this cost is not quantified. Victim model inference costs are also unreported."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Appendix A mentions a single NVIDIA H100 GPU but does not state total GPU hours, API spend, or total runtime. The cost of generating 900+ transition prompts via GPT-4o is not quantified."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of random seeds or seed sensitivity analysis. Results appear to be single-run with do_sample=false (deterministic generation for open-source models), but no seed variation is tested."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The exact number of experimental runs is not stated. It is unclear whether results are from single runs or averaged."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The transition turn count is fixed at m=5 without reporting any search over alternatives. No hyperparameter search budget or method is described."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "Key design choices (m=5 turns, specific prompt templates, role identifiers) appear fixed without justification for why these configurations were selected or any systematic comparison of alternatives."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The paper makes hundreds of comparisons across 10 models, 5 defenses, 3 datasets, and 6+ attack methods, but no correction for multiple comparisons is applied."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement all baseline attack methods themselves but do not acknowledge potential self-comparison bias. Lucic et al. (2018) showed that authors' implementations of baselines systematically underperform."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "TopicAttack requires additional GPT-4o API calls to generate multi-turn transition prompts, making it computationally more expensive than simple baselines (Naive, Ignore, Escape). This cost disparity is not discussed or controlled for in the comparison."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper does not discuss whether ASR on these specific datasets (derived from SQuAD/TriviaQA with synthetic injected instructions) actually reflects real-world attack effectiveness against deployed LLM applications."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding confound exists — all attack methods are prompt-based and evaluated under identical model configurations. The comparison is between attack strategies, not between scaffolded systems."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The SQuAD and TriviaQA datasets are from 2016 and 2017 respectively, well before the training-data cutoffs of all tested models, so the models may have seen these QA pairs during training. This could affect their behavior when the same content appears as 'retrieved data.' This is not discussed."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup could leak information about the expected response. For example, the benign data content directly answers the user's question, which could interact with the model's training on the same QA pairs."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether examples in Inj-SQuAD/Inj-TriviaQA share structural similarities that could inflate apparent generalization. Examples from the same QA dataset may share topical or structural patterns."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods are applied. No analysis of whether models' familiarity with SQuAD/TriviaQA content affects attack outcomes."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "TopicAttack achieves state-of-the-art ASR over 90% in most configurations, outperforming all baseline attack methods.",
    365       "evidence": "Tables 1-6 show TopicAttack achieving the highest ASR across 10 models, 3 datasets, and 5 defense configurations. For example, on Qwen2-7B with Inj-SQuAD (Table 1), TopicAttack achieves 99.22% (None), 68.56% (Sandwich), 99.44% (Spotlight), 99.22% (StruQ), and 92.00% (SecAlign).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "TopicAttack maintains high effectiveness even under strong fine-tuning-based defenses (StruQ, SecAlign) where other attacks fail.",
    370       "evidence": "Tables 1-2 show that on StruQ/SecAlign defenses, baseline attacks drop to near 0% while TopicAttack achieves up to 98.67% (StruQ) and 92.00% (SecAlign) on Qwen2-7B. On Llama3.1-8B with SecAlign, TopicAttack achieves 90.67% vs <5% for all baselines.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "The reminding prompt is crucial for attack effectiveness, especially under the Sandwich defense.",
    375       "evidence": "Table 9 shows removing the reminding prompt causes ASR drops of 20-25 percentage points under Sandwich defense across models (e.g., 79.78% → 55.89% on Llama3-8B Inj-SQuAD, 72.67% → 46.78% on Llama3.1-8B Inj-TriviaQA).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "TopicAttack lowers the perplexity of the injected instruction, indicating smoother topic integration.",
    380       "evidence": "Figure 2 shows the average log perplexity distribution of injected instructions: TopicAttack consistently has lower perplexity than Fakecom and Naive across Llama3-8B, Llama3.1-8B, and Qwen2-7B.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Higher attention ratio on injected vs. original instructions correlates with higher attack success, and TopicAttack achieves the highest ratio.",
    385       "evidence": "Figure 3 shows the distribution of injected/original attention score ratios under No Defense, StruQ, and SecAlign. TopicAttack achieves substantially higher ratios than Fakecom and Naive across all three defense settings.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "TopicAttack's effectiveness transfers to closed-source models including GPT-4o and GPT-4.1.",
    390       "evidence": "Tables 4 and 6 show TopicAttack achieves 99.78-100% ASR without defense and 60.44-99.56% under defenses on GPT-4o-mini, GPT-4o, and GPT-4.1, significantly outperforming all baselines on both chatbot and agent scenarios.",
    391       "supported": "strong"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No uncertainty quantification across hundreds of comparisons",
    397       "detail": "All results are single-run point estimates with no confidence intervals, error bars, or variance measures. With 900 binary outcomes per cell, even a simple binomial 95% CI would be informative (about ±2pp at 90% ASR). The paper claims that TopicAttack 'significantly outperforms' baselines without any statistical tests."
    398     },
    399     {
    400       "flag": "Unequal compute comparison with baselines",
    401       "detail": "TopicAttack requires GPT-4o API calls to generate multi-turn transition prompts for each attack instance, while baseline attacks (Naive, Ignore, Escape) are simple prompt templates. This gives TopicAttack a substantial compute advantage that is never discussed or accounted for."
    402     },
    403     {
    404       "flag": "Self-implementation of all baselines",
    405       "detail": "The authors implement all baseline attack methods themselves rather than using official implementations. This introduces potential self-comparison bias where baselines may be implemented suboptimally, as documented by Lucic et al. (2018)."
    406     },
    407     {
    408       "flag": "No analysis of failure modes",
    409       "detail": "On Llama3-8B with SecAlign, TopicAttack achieves only 0.44% ASR (Table 1), nearly complete failure. This is never analyzed or discussed. Understanding why the attack fails against this specific model-defense combination could reveal important limitations."
    410     }
    411   ],
    412   "cited_papers": [
    413     {
    414       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    415       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    416       "year": 2023,
    417       "relevance": "Foundational work on indirect prompt injection attacks against LLM-integrated applications, directly motivates TopicAttack."
    418     },
    419     {
    420       "title": "Evaluating the instruction-following robustness of large language models to prompt injection",
    421       "authors": ["Zekun Li", "Baolin Peng", "Pengcheng He", "Xifeng Yan"],
    422       "year": 2023,
    423       "relevance": "Evaluates LLM vulnerability to prompt injection, establishing the instruction-following robustness problem."
    424     },
    425     {
    426       "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents",
    427       "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"],
    428       "year": 2024,
    429       "arxiv_id": "2403.02691",
    430       "relevance": "Provides the InjectAgent benchmark dataset used for agent attack evaluation in this paper."
    431     },
    432     {
    433       "title": "StruQ: Defending against prompt injection with structured queries",
    434       "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"],
    435       "year": 2024,
    436       "arxiv_id": "2402.06363",
    437       "relevance": "Fine-tuning-based defense method that TopicAttack is evaluated against; represents strong adversarial training defense."
    438     },
    439     {
    440       "title": "SecAlign: Defending against prompt injection with preference optimization",
    441       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "David Wagner", "Chuan Guo"],
    442       "year": 2025,
    443       "arxiv_id": "2410.05451",
    444       "relevance": "DPO-based defense that represents the strongest tested defense against prompt injection; TopicAttack is the first method to substantially bypass it."
    445     },
    446     {
    447       "title": "Defending against indirect prompt injection attacks with spotlighting",
    448       "authors": ["Keegan Hines", "Gary Lopez", "Matthew Hall", "Federico Zarfati", "Yonatan Zunger", "Emre Kiciman"],
    449       "year": 2024,
    450       "arxiv_id": "2403.14720",
    451       "relevance": "Proposes the Spotlight defense using special tokens to mark data regions, one of the key defense baselines evaluated."
    452     },
    453     {
    454       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    455       "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
    456       "year": 2024,
    457       "relevance": "Formalizes the Combined attack baseline and provides systematic benchmarking framework for prompt injection research."
    458     },
    459     {
    460       "title": "The instruction hierarchy: Training LLMs to prioritize privileged instructions",
    461       "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"],
    462       "year": 2024,
    463       "arxiv_id": "2404.13208",
    464       "relevance": "Proposes training LLMs to distinguish instruction privilege levels as a defense against prompt injection."
    465     },
    466     {
    467       "title": "AgentDojo: A dynamic environment to evaluate prompt injection attacks and defenses for LLM agents",
    468       "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunovic", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"],
    469       "year": 2024,
    470       "relevance": "Provides a dynamic evaluation environment for prompt injection in LLM agent settings, relevant to the agent attack scenario."
    471     },
    472     {
    473       "title": "Universal and transferable adversarial attacks on aligned language models",
    474       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J Zico Kolter", "Matt Fredrikson"],
    475       "year": 2023,
    476       "arxiv_id": "2307.15043",
    477       "relevance": "Proposes the GCG gradient-based attack method, used as a baseline comparison for TopicAttack."
    478     },
    479     {
    480       "title": "Can indirect prompt injection attacks be detected and removed?",
    481       "authors": ["Yulin Chen", "Haoran Li", "Yuan Sui", "Yufei He", "Yue Liu", "Yangqiu Song", "Bryan Hooi"],
    482       "year": 2025,
    483       "arxiv_id": "2502.16580",
    484       "relevance": "Constructs the Inj-SQuAD and Inj-TriviaQA datasets used for chatbot attack evaluation in this paper."
    485     },
    486     {
    487       "title": "Machine against the RAG: Jamming retrieval-augmented generation with blocker documents",
    488       "authors": ["Avital Shafran", "Roei Schuster", "Vitaly Shmatikov"],
    489       "year": 2024,
    490       "arxiv_id": "2406.05870",
    491       "relevance": "Demonstrates attacks against RAG systems via document poisoning, a related attack vector to indirect prompt injection."
    492     }
    493   ],
    494   "engagement_factors": {
    495     "practical_relevance": {
    496       "score": 2,
    497       "justification": "Security researchers and red-teamers can use the released code to test LLM applications against topic-transition prompt injection attacks."
    498     },
    499     "surprise_contrarian": {
    500       "score": 1,
    501       "justification": "The idea that gradual topic transitions improve attack success is intuitive; the surprising element is the magnitude of improvement over existing methods against strong defenses."
    502     },
    503     "fear_safety": {
    504       "score": 3,
    505       "justification": "Demonstrates that even the strongest current defenses (StruQ, SecAlign) can be bypassed with >90% success, raising serious concerns about LLM application security."
    506     },
    507     "drama_conflict": {
    508       "score": 1,
    509       "justification": "Implies current defenses are inadequate but presents this as incremental research without framing it as a major controversy."
    510     },
    511     "demo_ability": {
    512       "score": 2,
    513       "justification": "Code is released on GitHub and uses standard open-source models, so researchers can reproduce the attacks with moderate setup effort."
    514     },
    515     "brand_recognition": {
    516       "score": 1,
    517       "justification": "From NUS and HKUST — respected but not household-name AI labs. Testing against GPT-4o adds some brand recognition."
    518     }
    519   }
    520 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs