scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32051B)
      1 {
      2   "paper": {
      3     "title": "QueryIPI: Query-agnostic Indirect Prompt Injection on Coding Agents",
      4     "authors": [
      5       "Yuchong Xie",
      6       "Zesen Liu",
      7       "Mingyu Luo",
      8       "Zhixiang Zhang",
      9       "Kaikai Zhang",
     10       "Yuanyuan Yuan",
     11       "Zongjie Li",
     12       "Ping Chen",
     13       "Shuai Wang",
     14       "Dongdong She"
     15     ],
     16     "year": 2025,
     17     "venue": "arXiv.org",
     18     "arxiv_id": "2510.23675",
     19     "doi": "10.48550/arXiv.2510.23675"
     20   },
     21   "scan_version": 3,
     22   "active_modules": ["experimental_rigor", "data_leakage"],
     23   "methodology_tags": ["benchmark-eval"],
     24   "key_findings": "QueryIPI introduces query-agnostic indirect prompt injection for coding agents, achieving 87% attack success rate on simulated agents with just 8 training samples, far exceeding the best baseline (50%). The method leverages leaked system prompts ('system invariants') to guide black-box optimization of malicious tool descriptions. Attacks transfer from simulation to real-world coding agents at 50% ASR, and resist both prevention-based (Sandwich Defense, Spotlighting) and detection-based (perplexity) defenses.",
     25   "checklist": {
     26     "artifacts": {
     27       "code_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "A GitHub repository URL is provided: 'Source code: https://github.com/QueryIPI/QueryIPI.git' on the first page of the paper."
     31       },
     32       "data_released": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The paper uses two publicly available datasets: the Command Injection Payload List (https://github.com/payloadbox/command-injection-payload-list) and LMSYS-Chat-1M (Zheng et al., 2024). Both are public and accessible."
     36       },
     37       "environment_specified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is provided in the paper. Only the backend LLM (Claude-Sonnet-4) and temperature settings are mentioned."
     41       },
     42       "reproduction_instructions": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No step-by-step reproduction instructions are provided in the paper. The algorithm is described (Algorithm 1) and parameter settings given (Section 5.1), but there are no explicit instructions for reproducing the experiments."
     46       }
     47     },
     48     "statistical_methodology": {
     49       "confidence_intervals_or_error_bars": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "All results in Tables 1-4 are point estimates (ASR values) with no confidence intervals or error bars reported."
     53       },
     54       "significance_tests": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The paper claims 'QueryIPI significantly outperforms the baselines' (Section 5.2) based solely on comparing ASR numbers without any statistical significance tests."
     58       },
     59       "effect_sizes_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Absolute ASR values are reported with baseline context throughout (e.g., 'average ASR of 0.70 even when trained with just two samples' vs baseline 0.50). Relative improvements are contextualized with the baseline reference points in Tables 1-4."
     63       },
     64       "sample_size_justified": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The test set of 100 cases (10 commands × 10 queries) is used without any justification for why this sample size is adequate. No power analysis is provided."
     68       },
     69       "variance_reported": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No variance, standard deviation, or spread measures are reported across runs. The target agent and judge run at temperature 0 for determinism, but the mutator LLM uses temperature 1, introducing stochasticity in the optimization process whose variance is never measured."
     73       }
     74     },
     75     "evaluation_design": {
     76       "baselines_included": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Four baselines are included: AgentDojo, InjecAgent, MCPTox, and TIP (TIPExploit). Results compared in Table 1. The paper also justifies excluding ToolHijacker (Section 5.1)."
     80       },
     81       "baselines_contemporary": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "All baselines are from 2024-2025: AgentDojo (2024), InjecAgent (2024), MCPTox (2025), and TIPExploit (2025). These represent recent work in the IPI space."
     85       },
     86       "ablation_study": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "RQ2 (Section 5.3, Table 2) ablates the internal prompt component, testing 'w/o Internal Prompt' (average ASR drops from 70% to 20%) and 'Partial Prompt' (71% ASR). This demonstrates the contribution of the system invariant."
     90       },
     91       "multiple_metrics": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "Only a single metric is used: Attack Success Rate (ASR). The perplexity analysis in Section 5.5 is for defense evasion evaluation, not for measuring attack effectiveness. No secondary metrics like attack latency, optimization cost, or attack specificity are reported."
     95       },
     96       "human_evaluation": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "Evaluation is entirely automated via an LLM judge (Section B). While the judge was calibrated against human annotators (152 data points, Spearman's ρ = 0.78), human evaluation was not used as a primary evaluation method for the attack results."
    100       },
    101       "held_out_test_set": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Training uses 2/4/8 queries for optimization, while ASR is measured on a separate set of 100 test cases (10 commands × 10 queries), with the queries randomly sampled from LMSYS-Chat-1M (Section 5.1)."
    105       },
    106       "per_category_breakdown": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Results are broken down per agent (Cursor, Copilot, Cline, Trae, Windsurf) in Tables 1-4, and per training sample count (n=2, 4, 8). Per-command breakdowns are not provided."
    110       },
    111       "failure_cases_discussed": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 5.3 discusses failure modes when the internal prompt is unavailable (ASR drops to 20%). Section 5.4 acknowledges performance degradation in real-world transfer. The mutation prompt design (Section 4.2) explicitly addresses two failure cases: irrelevant responses and safety refusals."
    115       },
    116       "negative_results_reported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Appendix D (RQ5) reports poor cross-LLM transferability for Copilot (ASR drops from 85% to near 0% on non-Claude backends). Real-world transfer (Table 3) shows degradation from 87% to 50%. The w/o Internal Prompt ablation shows 20% ASR."
    120       }
    121     },
    122     "claims_and_evidence": {
    123       "abstract_claims_supported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Abstract claims of 70%/82%/87% ASR with 2/4/8 samples are supported by Table 1. The claim that the best baseline achieves 50% is supported by TIP's average in Table 1. Real-world transferability claim is supported by Table 3."
    127       },
    128       "causal_claims_justified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The primary causal claim — that the internal prompt 'is a key driver of the attack's success' (Section 5.3) — is supported by a controlled ablation: removing internal prompt access drops average ASR from 70% to 20% (Table 2). This is a single-variable manipulation with the rest of the framework held constant."
    132       },
    133       "generalization_bounded": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The abstract claims query-agnostic IPI is 'highly effective in practice,' but main results are from simulated agents (not real-world), and real-world transfer (Table 3) shows substantial degradation (87%→50%). The title says 'Coding Agents' broadly, but only 5 IDE agents were tested. Cross-LLM transfer (Table 4) shows highly variable results (0% to 95%)."
    137       },
    138       "alternative_explanations_discussed": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "No discussion of alternative explanations for QueryIPI's success. Could the improvement over baselines be due to the iterative optimization alone rather than the system invariant insight? The paper does not consider confounding factors or alternative mechanisms."
    142       },
    143       "proxy_outcome_distinction": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The paper measures ASR (whether a malicious command is executed) and claims this demonstrates attack effectiveness. The measurement directly matches the claim — there is no proxy gap between what is measured and what is claimed."
    147       }
    148     },
    149     "setup_transparency": {
    150       "model_versions_specified": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The backend LLM is specified as 'Claude-Sonnet-4 (Anthropic, 2025)' without a snapshot date or API version. Cross-LLM experiments use 'GPT-5', 'Grok-4', and 'Gemini-2.5-pro' — all marketing names without version specifiers."
    154       },
    155       "prompts_provided": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "Appendix E.1 and E.2 describe the mutation and judge prompts in a modular, third-person summary format ('Role Module: Defines the operational position...'), not the actual prompt text sent to the LLM. The reader cannot reconstruct the exact prompts used."
    159       },
    160       "hyperparameters_reported": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section 5.1 reports: temperature 0 for target agent and judge, temperature 1 for mutator, 20 generations, 2 variants per generation. These are the key hyperparameters for the optimization process."
    164       },
    165       "scaffolding_described": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "The QueryIPI framework is described in detail: Algorithm 1 outlines the iterative optimization, Section 4.2 describes the mutation function (initial seed generation and reflective optimization phases), and the scoring function is detailed in Section B. The simulated agent setup (leaked system prompts + Claude-Sonnet-4, single-turn evaluation) is described in Section 5.1."
    169       },
    170       "data_preprocessing_documented": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "The paper says queries are 'randomly sampled' from LMSYS-Chat-1M and 10 commands are 'selected' from the Command Injection Payload List. No filtering criteria, sampling methodology, or preprocessing steps are described."
    174       }
    175     },
    176     "limitations_and_scope": {
    177       "limitations_section_present": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 7 'Limitations' is a dedicated section discussing scope restrictions (coding agents only), architectural prerequisites (tool integration support), and operational requirements (capable backend LLM)."
    181       },
    182       "threats_to_validity_specific": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "Section 7 discusses scope boundaries (coding agents, tool integration support) but does not address specific threats to the validity of the experimental findings, such as the representativeness of the simulated environment, adequacy of 100 test cases, or reliability of the LLM judge for final ASR determination."
    186       },
    187       "scope_boundaries_stated": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 7 explicitly states: 'we limit our scope to coding agents in this work,' the target system 'must support user-defined tools,' and the attack 'is not limited to coding agents but theoretically extends to any framework satisfying these conditions.'"
    191       }
    192     },
    193     "data_integrity": {
    194       "raw_data_available": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No raw experimental data (individual agent responses, judge scores per test case, generated malicious descriptions) is provided or linked. Only aggregated ASR values are reported."
    198       },
    199       "data_collection_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Section 5.1 describes data collection: 10 commands selected from the Command Injection Payload List, queries randomly sampled from LMSYS-Chat-1M, test cases constructed as 10 commands × 10 queries = 100 test cases."
    203       },
    204       "recruitment_methods_described": {
    205         "applies": false,
    206         "answer": false,
    207         "justification": "No human participants. Data sources are standard public datasets (Command Injection Payload List, LMSYS-Chat-1M)."
    208       },
    209       "data_pipeline_documented": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "The pipeline from agent responses to binary ASR is not fully documented. The scoring function (Section B) produces continuous 0-100 scores, but the paper does not clearly state how final ASR binary success/failure is determined for the test evaluation (e.g., whether score=100 is required or some threshold is used)."
    213       }
    214     },
    215     "conflicts_of_interest": {
    216       "funding_disclosed": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No funding information, acknowledgments section, or grant numbers appear anywhere in the paper."
    220       },
    221       "affiliations_disclosed": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "Author affiliations are clearly listed: HKUST, Fudan University, and Tsinghua University. The paper evaluates third-party products (Cursor, Copilot, etc.) with no apparent affiliation conflict."
    225       },
    226       "funder_independent_of_outcome": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not evidence of absence of conflict."
    230       },
    231       "financial_interests_declared": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No competing interests statement or financial disclosure appears in the paper."
    235       }
    236     },
    237     "contamination": {
    238       "training_cutoff_stated": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "This is a red-teaming/security study that tests attack effectiveness against coding agents, not a benchmark evaluation of model knowledge. Contamination of training data is not the relevant concern."
    242       },
    243       "train_test_overlap_discussed": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "The paper tests attack methods, not model knowledge on benchmarks. Train/test contamination in the traditional sense is not applicable."
    247       },
    248       "benchmark_contamination_addressed": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "The paper evaluates attack success rates, not model performance on knowledge benchmarks. Benchmark contamination is not a relevant concern for this study type."
    252       }
    253     },
    254     "human_studies": {
    255       "pre_registered": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study. All experiments are automated evaluations of attack methods against LLM-based coding agents."
    259       },
    260       "irb_or_ethics_approval": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants. Section 8 discusses ethical considerations related to dual-use risk of the attack methodology, but no IRB review is applicable."
    264       },
    265       "demographics_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in the study."
    269       },
    270       "inclusion_exclusion_criteria": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in the study."
    274       },
    275       "randomization_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in the study."
    279       },
    280       "blinding_described": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in the study."
    284       },
    285       "attrition_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human participants in the study."
    289       }
    290     },
    291     "cost_and_practicality": {
    292       "inference_cost_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No API costs, tokens consumed, or wall-clock time for the attack optimization or evaluation are reported. The method requires multiple LLM calls (mutator, target agent, judge) across 20 generations but no cost quantification is provided."
    296       },
    297       "compute_budget_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No total computational budget (API spend, GPU hours, total calls) is stated for the experiments despite extensive use of Claude-Sonnet-4 across 5 agents × multiple configurations."
    301       }
    302     },
    303     "experimental_rigor": {
    304       "seed_sensitivity_reported": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "Target agent and judge use temperature 0 for determinism, but the mutator uses temperature 1, introducing stochasticity. No results across multiple random seeds are reported to assess optimization variance."
    308       },
    309       "number_of_runs_stated": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "It is not stated how many times the full optimization was run per agent-command configuration. Results appear to be from single optimization runs."
    313       },
    314       "hyperparameter_search_budget": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "Hyperparameters (20 generations, 2 variants, temperature settings) are stated but no search budget or justification for these values is provided. No description of how these hyperparameters were selected."
    318       },
    319       "best_config_selection_justified": {
    320         "applies": true,
    321         "answer": true,
    322         "justification": "The optimization process selects the best-performing description via training set scores (Algorithm 1, SelectTop function), and final evaluation is on a separate test set of 100 cases. For RQ3, they select 'the most effective malicious tool description from our RQ1 analysis' (Section 5.4)."
    323       },
    324       "multiple_comparison_correction": {
    325         "applies": false,
    326         "answer": false,
    327         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    328       },
    329       "self_comparison_bias_addressed": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "The authors compare their own QueryIPI against baselines without acknowledging author-evaluation bias. Baselines were manually adapted by the authors ('we manually adapted the malicious payloads to suit each of the 10 malicious commands'), potentially disadvantaging them."
    333       },
    334       "compute_budget_vs_performance": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "QueryIPI requires iterative optimization (20 generations with LLM calls for mutation, scoring, and judging) while baselines are non-iterative. This compute difference is not discussed or controlled for."
    338       },
    339       "benchmark_construct_validity": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "The paper does not discuss whether ASR on simulated agents with 100 test cases validly measures real-world attack threat. The sim-to-real gap is acknowledged (Section 5.4) but construct validity of the simulated benchmark is not analyzed."
    343       },
    344       "scaffold_confound_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "All five simulated agents use Claude-Sonnet-4 as backend but differ in system prompts and tool definitions. The paper does not discuss whether performance differences across agents are attributable to scaffold differences vs. other factors. RQ5 varies the backend LLM but does not disentangle model vs. scaffold effects."
    348       }
    349     },
    350     "data_leakage": {
    351       "temporal_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether LMSYS-Chat-1M queries or command injection payloads could have been seen during Claude-Sonnet-4's training, potentially affecting the model's response patterns."
    355       },
    356       "feature_leakage_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of whether the evaluation setup leaks information. The malicious tool description is directly injected into the agent's context, which is the intended attack vector, but there is no analysis of whether the test setup creates unrealistic advantages."
    360       },
    361       "non_independence_addressed": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "Training and test queries are both randomly sampled from LMSYS-Chat-1M. No analysis of whether independence between training and test sets is maintained (e.g., duplicate or near-duplicate queries)."
    365       },
    366       "leakage_detection_method": {
    367         "applies": true,
    368         "answer": false,
    369         "justification": "No concrete leakage detection or prevention method is applied. No deduplication, temporal splitting, or contamination checking between training and test sets."
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "QueryIPI achieves average ASR of 70%, 82%, and 87% with 2, 4, and 8 training samples across five simulated coding agents.",
    376       "evidence": "Table 1 (Section 5.2) shows per-agent ASR results across training sample configurations. Average ASR: 0.70 (n=2), 0.82 (n=4), 0.87 (n=8).",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "QueryIPI significantly outperforms baselines, with the best baseline (TIP) achieving only 50% average ASR.",
    381       "evidence": "Table 1 shows AgentDojo (0.02), InjecAgent (0.00), MCPTox (0.00), TIP (0.50) average ASR vs QueryIPI's 0.70-0.87.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "The internal prompt is a key driver of attack success.",
    386       "evidence": "Table 2 (Section 5.3) shows removing internal prompt access drops ASR from 70% to 20%. Partial prompt access yields 71% ASR.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Malicious tool descriptions transfer from simulated to real-world coding agents.",
    391       "evidence": "Table 3 (Section 5.4) shows QueryIPI achieves 50% average ASR on real-world agents vs TIP's 2%. Trae (72%) and Cursor (63%) show strongest transfer.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "QueryIPI is robust against prevention-based and detection-based defenses.",
    396       "evidence": "Section 5.5: Sandwich Defense yields 82% ASR, Spotlighting variants 52-84% ASR. Perplexity (59.04) is far below that of real MCP tools (1423.27).",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "QueryIPI has notable cross-LLM transferability with 26% average ASR across different backends.",
    401       "evidence": "Table 4 (Appendix D) shows variable results: 40% average on GPT-5, 21% on Grok-4, 16% on Gemini-2.5-pro. Copilot drops to near 0% for all non-Claude backends.",
    402       "supported": "weak"
    403     }
    404   ],
    405   "red_flags": [
    406     {
    407       "flag": "Simulated environment as primary evaluation",
    408       "detail": "Main results (Table 1) use simulated agents built from leaked system prompts + Claude-Sonnet-4, not actual coding agents. Real-world transfer (Table 3) shows substantial degradation from 87% to 50% ASR, indicating the simulation may overestimate attack effectiveness."
    409     },
    410     {
    411       "flag": "No error bars or uncertainty quantification",
    412       "detail": "All results are point estimates from apparently single optimization runs. The mutator LLM uses temperature 1, introducing stochasticity, yet no variance across multiple runs is reported. Different optimization runs could yield very different malicious descriptions."
    413     },
    414     {
    415       "flag": "Small and unjustified test set",
    416       "detail": "ASR is measured on only 100 test cases (10 commands × 10 queries) with no justification for this sample size. With such small N per agent-command combination, the reported ASR values have wide implicit confidence intervals."
    417     },
    418     {
    419       "flag": "Baseline adaptation by authors",
    420       "detail": "The authors manually adapted baseline payloads for each of the 10 malicious commands (Section 5.1). This introduces potential bias — the baselines were designed for different settings and may perform worse when adapted by the competing method's authors."
    421     },
    422     {
    423       "flag": "Unfair compute comparison with baselines",
    424       "detail": "QueryIPI uses iterative optimization (20 generations × multiple LLM calls) while baselines are static injection strategies. The compute advantage is neither quantified nor controlled for, making it unclear how much of the improvement comes from the method's insight vs. simply more computation."
    425     },
    426     {
    427       "flag": "LLM judge reliability for final evaluation",
    428       "detail": "The scoring function uses a continuous 0-100 scale, but the paper reports binary ASR without clearly specifying the threshold. The judge was validated on 152 data points (Spearman's ρ = 0.78), which indicates good but not excellent alignment with human judgment."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    434       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    435       "year": 2023,
    436       "relevance": "Foundational work on indirect prompt injection attacks against LLM-integrated applications."
    437     },
    438     {
    439       "title": "AgentDojo: A dynamic environment to evaluate attacks and defenses for LLM agents",
    440       "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunović", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"],
    441       "year": 2024,
    442       "arxiv_id": "2406.13352",
    443       "relevance": "Benchmark for evaluating indirect prompt injection attacks and defenses on LLM agents."
    444     },
    445     {
    446       "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents",
    447       "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"],
    448       "year": 2024,
    449       "arxiv_id": "2403.02691",
    450       "relevance": "Benchmark for indirect prompt injection in tool-integrated LLM agents, used as baseline."
    451     },
    452     {
    453       "title": "AgentPoison: Red-teaming LLM agents via poisoning memory or knowledge bases",
    454       "authors": ["Zhaorun Chen", "Zhen Xiang", "Chaowei Xiao", "Dawn Song", "Bo Li"],
    455       "year": 2024,
    456       "arxiv_id": "2407.12784",
    457       "relevance": "Red-teaming approach for LLM agents through poisoning memory and knowledge bases."
    458     },
    459     {
    460       "title": "MCPTox: A benchmark for tool poisoning attack on real-world MCP servers",
    461       "authors": ["Zhiqiang Wang", "Yichao Gao", "Yanting Wang"],
    462       "year": 2025,
    463       "relevance": "Benchmark for tool poisoning attacks on Model Context Protocol servers, used as baseline."
    464     },
    465     {
    466       "title": "AGENTVIGIL: Generic black-box red-teaming for indirect prompt injection against LLM agents",
    467       "authors": ["Zhun Wang", "Vincent Siu", "Zhe Ye"],
    468       "year": 2025,
    469       "relevance": "Generic black-box red-teaming method for indirect prompt injection against LLM agents."
    470     },
    471     {
    472       "title": "Exploit tool invocation prompt for tool behavior hijacking in LLM-based agentic system",
    473       "authors": ["Yuchong Xie", "Mingyu Luo", "Zesen Liu"],
    474       "year": 2025,
    475       "relevance": "Prior work on tool behavior hijacking in LLM agents via tool invocation prompts, used as primary baseline (TIP)."
    476     },
    477     {
    478       "title": "Prompt injection attack to tool selection in LLM agents",
    479       "authors": ["Jiawen Shi", "Zenghui Yuan", "Guiyao Tie"],
    480       "year": 2025,
    481       "relevance": "ToolHijacker attack on tool selection in LLM agents, discussed as related but excluded baseline."
    482     },
    483     {
    484       "title": "Defending against indirect prompt injection attacks with spotlighting",
    485       "authors": ["Keegan Hines", "Gary Lopez", "Matthew Hall", "Federico Zarfati", "Yonatan Zunger", "Emre Kiciman"],
    486       "year": 2024,
    487       "arxiv_id": "2403.14720",
    488       "relevance": "Defense mechanism against indirect prompt injection using data marking and delimiting, evaluated as defense in this paper."
    489     },
    490     {
    491       "title": "Tree of attacks: Jailbreaking black-box LLMs automatically",
    492       "authors": ["Anay Mehrotra", "Manolis Zampetakis", "Paul Kassianik"],
    493       "year": 2024,
    494       "relevance": "Automated jailbreaking framework (TAP) whose scoring approach inspired QueryIPI's scoring function."
    495     },
    496     {
    497       "title": "Adaptive attacks break defenses against indirect prompt injection attacks on LLM agents",
    498       "authors": ["Qiusi Zhan", "Richard Fang", "Henil Shalin Panchal", "Daniel Kang"],
    499       "year": 2025,
    500       "relevance": "Demonstrates that adaptive attacks can bypass defenses against indirect prompt injection in LLM agents."
    501     },
    502     {
    503       "title": "Lost in the middle: How language models use long contexts",
    504       "authors": ["Nelson F Liu", "Kevin Lin", "John Hewitt"],
    505       "year": 2023,
    506       "relevance": "Finding that LLMs under-use information located in the middle of long contexts, motivating QueryIPI's system invariant approach."
    507     },
    508     {
    509       "title": "Jailbroken: How does LLM safety training fail?",
    510       "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"],
    511       "year": 2023,
    512       "relevance": "Analysis of LLM safety training failure modes, relevant to understanding why prompt injection attacks succeed."
    513     },
    514     {
    515       "title": "Universal and transferable adversarial attacks on aligned language models",
    516       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J Zico Kolter", "Matt Fredrikson"],
    517       "year": 2023,
    518       "relevance": "Foundational work on universal adversarial attacks against aligned LLMs."
    519     }
    520   ],
    521   "engagement_factors": {
    522     "practical_relevance": {
    523       "score": 2,
    524       "justification": "Security researchers and AI tool developers can use the methodology to test coding agent defenses, though it requires significant setup to replicate."
    525     },
    526     "surprise_contrarian": {
    527       "score": 2,
    528       "justification": "Demonstrates that query-agnostic attacks are feasible, challenging the assumption that IPI attacks are limited to specific trigger queries."
    529     },
    530     "fear_safety": {
    531       "score": 3,
    532       "justification": "Shows that widely-used coding agents (Cursor, Copilot, Windsurf, Cline, Trae) can be reliably compromised to execute arbitrary commands regardless of user intent."
    533     },
    534     "drama_conflict": {
    535       "score": 2,
    536       "justification": "Implicates major coding tools as vulnerable and highlights that leaked system prompts enable attacks, touching on supply chain and MCP security concerns."
    537     },
    538     "demo_ability": {
    539       "score": 2,
    540       "justification": "Source code is released on GitHub, though the attack requires setting up simulated agents and the paper deliberately does not release ready-to-use attack payloads."
    541     },
    542     "brand_recognition": {
    543       "score": 2,
    544       "justification": "Targets well-known products (Cursor, GitHub Copilot, Windsurf, Cline, Trae) but authored by academic researchers without major industry affiliation."
    545     }
    546   }
    547 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs