scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28729B)
      1 {
      2   "paper": {
      3     "title": "Securing AI Agents Against Prompt Injection Attacks: A Comprehensive Benchmark and Defense Framework",
      4     "authors": ["Badrinath Ramakrishnan", "Akshaya Balaji"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2511.15759",
      8     "doi": "10.48550/arXiv.2511.15759"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "The paper proposes a multi-layered defense framework for RAG systems against prompt injection attacks, combining content filtering, hierarchical guardrails, and response verification. Evaluated across 7 LLMs and 847 attack test cases, the combined framework reportedly reduces attack success rates from 73.2% to 8.7% while retaining 94.3% of task performance. However, the paper has significant integrity concerns: placeholder funding acknowledgment, no author affiliations, potentially fabricated references with suspiciously sequential page numbers (1234-1245 and 12345-12356), and claimed artifact release with no URLs provided.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The abstract claims 'We release our benchmark dataset and defense implementations' but the paper contains no repository URL, Zenodo archive, or any working link to code. A promise without a URL counts as NO."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper describes an 847-case benchmark dataset and claims to release it, but provides no download link or repository URL anywhere in the paper."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, conda environment, or library version specifications are provided. The paper does not describe the software environment used for experiments."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The experimental setup (Section 6.1) describes which models and configurations were tested but lacks operational details for reproduction."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results in Tables 2 and 3 are point estimates (e.g., '73.2%', '8.7%', '94.3%') with no confidence intervals, error bars, or uncertainty quantification."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims improvements across defense configurations and differences across models without any statistical significance tests. No p-values, t-tests, or bootstrap tests are reported."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports effect sizes with baseline context: 'reduces successful attack rates from 73.2% to 8.7%' (Table 2), '88.1% reduction from baseline' (Section 6.2), and per-component reductions in the ablation (Section 6.5). These provide sufficient context for the magnitude of effects."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The benchmark contains 847 adversarial test cases and 500 benign contexts. No justification is given for why these sizes were chosen or whether they provide sufficient statistical power for the claims made."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. It is unclear whether experiments were run multiple times. All results appear to be single-run point estimates."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Table 2 includes a no-defense baseline configuration and incremental defense additions (filtering only, +guardrails, full defense). This provides a clear progression showing each layer's contribution."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The paper compares only against its own defense components (ablation) and a no-defense baseline. It does not compare against prior defense methods cited in the related work (Hines et al., Liu et al., Zhang et al.). Additionally, the 7 evaluated models are all from 2023 (GPT-4-0613, Claude 2.1, PaLM 2, Llama 2) — significantly outdated for a November 2025 paper."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 6.5 presents systematic ablation studies showing each defense component's contribution. Table 2 also serves as an ablation by adding components incrementally."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper uses four evaluation metrics: Attack Success Rate (ASR), False Positive Rate (FPR), Task Performance Retention (TPR), and Defense Bypass Rate (DBR), as defined in Section 4.3."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation is included. Attack success is determined entirely by automated criteria. Given that prompt injection success can be subjective (partial compliance, indirect data leakage), human evaluation of attack outcomes would be relevant."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The embedding-based anomaly detection uses reference sets R (benign) and A (known attacks), but the paper does not discuss whether these overlap with the test set or whether parameters were tuned on a separate validation split. No train/test separation is described."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 2 provides per-category breakdowns across all five attack types (Direct Injection, Context Manipulation, Instruction Override, Data Exfiltration, Cross-Context Contamination) for each defense configuration."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 6.2 notes 'Remaining successful attacks predominantly fall into the advanced sophistication category.' Section 6.4 discusses false positive patterns. Section 6.5 discusses where individual defense components fail (e.g., content filtering only 42% effective against Level 3 attacks)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 6.5 explicitly reports that 'No single mechanism achieves acceptable protection independently.' Content filtering achieves only 42% reduction against Level 3 attacks. Section 7.1 acknowledges that sophisticated semantic injections still succeed."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "Internal inconsistency: the abstract states 'reduces successful attack rates from 73.2% to 8.7%' (88.1% reduction, confirmed in Section 6.2), but Contribution 3 claims '89.4% attack mitigation.' These numbers conflict. The abstract also claims artifacts are released, but no URLs are provided."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The ablation study design (Section 6.5) is adequate for the causal claims about each defense component's contribution. Components are added incrementally in a controlled manner (Table 2), supporting claims like 'content filtering reduces ASR to 41.0%' and 'guardrails provide additional protection.'"
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title claims 'Securing AI Agents' broadly, but the work tests only RAG-based text pipelines with 7 specific (outdated) models. Section 7.1 discusses some scope limitations (English only, static attacks, text only) but the title and abstract frame the contributions as general to 'AI agents' rather than bounding to RAG systems."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "Section 7 discusses limitations but does not consider alternative explanations for the results. For example: could the low attack success in the full framework be an artifact of the benchmark being too easy? Could model-specific results reflect training data exposure to attack patterns rather than architectural differences? No such alternatives are discussed."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures attack success rate (whether the model exhibits intended malicious behavior) and frames results as attack success rates rather than overclaiming broader security. Task performance is measured via established benchmarks (MMLU, HellaSwag). The measurements align with the claims at a reasonable granularity."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 6.1 specifies exact model versions: 'GPT-4 (gpt-4-0613)', 'GPT-3.5-turbo (gpt-3.5-turbo-16k)', 'Claude 2.1 (claude-2.1)', 'PaLM 2 (text-bison-001)', 'Llama 2 70B Chat', 'Mistral 7B Instruct', 'Vicuna 13B v1.5'."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Algorithm 1 shows the hierarchical prompt construction procedure in pseudocode, but the actual prompt text is not provided. The 'immutable system instructions' (πcore) and 'injection awareness directives' (πguard) are described conceptually but their actual text is not given. A template/pseudocode does not count."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Equation 1 introduces hyperparameters α and β for the anomaly score, but their values are not reported. No LLM generation hyperparameters (temperature, top-p, max tokens) are stated for any of the 7 models."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The defense pipeline is described in detail across Sections 5.1-5.3 with Figure 1 showing the architecture: embedding analysis → content filtering → guardrail application → LLM generation → response verification. Algorithm 1 provides the prompt construction procedure."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "Section 4.1 describes a 'multi-phase process combining manual curation, automated variation generation, and expert validation' with 200 base templates expanded to 847 cases, but the automated variation generation method, expert validation criteria and process, and how the 500 benign contexts were constructed are not documented."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7.1 'Limitations and Future Work' provides a dedicated subsection discussing four specific limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 7.1 discusses specific threats: English-only benchmark (multilingual attacks not covered), static attack patterns (adaptive adversaries not modeled), text-only RAG (multimodal not addressed), response verification as potential single point of failure. These are specific to this study."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 7.1 explicitly states what was not tested: multilingual attacks, adaptive adversaries, multimodal agents, and ensemble verification. Section 7.2 acknowledges that 'Not all applications require maximum security' and discusses deployment-specific considerations."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "Despite claiming to release the benchmark dataset, no URL or access mechanism is provided. The 847 test cases, 500 benign contexts, and per-model results are not available for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "Section 4.1 provides a high-level description ('multi-phase process combining manual curation, automated variation generation, and expert validation') but lacks detail on how base attack templates were created, what 'automated variation' entailed technically, and how 'expert validation' was conducted."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "The paper mentions 'expert validation' of the benchmark but does not describe who the experts were, how they were selected, or their qualifications. No information about the validators is provided."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The pipeline from 200 base templates to 847 total cases is mentioned but the transformation steps are not documented. How 'variations in phrasing, obfuscation, and sophistication' were generated, how many were rejected during validation, and the specific filtering criteria are absent."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "The acknowledgments section contains a placeholder: 'This work was supported by [funding source].' This is literally a template that was not filled in, not an actual funding disclosure."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "The authors are listed by name only (Badrinath Ramakrishnan, Akshaya Balaji) with no institutional affiliations provided anywhere in the paper."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Cannot be assessed because the funding source is a placeholder '[funding source]'. The absence of actual funding information prevents any evaluation of independence."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement or financial disclosure is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This paper tests a defense framework against prompt injection, not model knowledge on benchmarks. The MMLU/HellaSwag measurements are secondary task performance retention checks. The primary evaluation is about defense mechanism effectiveness, not model capability."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "The paper tests defenses/tools rather than model knowledge. The benchmark consists of novel adversarial test cases created by the authors, not capability evaluations where train/test overlap is the primary concern."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "The paper tests defense mechanisms rather than evaluating pre-trained model capabilities on established benchmarks. Contamination in the traditional sense (model memorizing answers) is not the relevant concern here."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study. The paper evaluates automated defense mechanisms against adversarial test cases."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The study involves only automated evaluation of language models and defense mechanisms."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in the study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Section 6.6 reports latency: content filtering adds 23ms (15ms embedding, 8ms anomaly detection), response verification adds 45ms, total defense overhead is ~2.1% of end-to-end latency for GPT-4 (3.2s average generation)."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Section 6.6 mentions memory requirements (180MB for embedding storage, 250MB for verification classifier) but does not state the total computational budget for running all experiments across 7 models and 1,347 test cases (847 adversarial + 500 benign)."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of random seeds, seed sensitivity, or multiple runs with different seeds. All results appear to be from single runs."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The paper does not state how many experimental runs produced the reported results. No mention of averaging over runs or trial counts."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The anomaly detection uses hyperparameters α, β, and a threshold, but no search budget, search method, or number of configurations tried is reported."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper does not explain how the final hyperparameter values (α, β, threshold) were selected or whether selection used a separate validation set. Only the 'best' configuration results are shown."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The paper makes comparisons across 7 models × 4 configurations × 5 attack categories with no correction for multiple comparisons. No Bonferroni, Holm, or other family-wise corrections are applied."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors evaluate their own defense framework against their own benchmark with no acknowledgment of author-evaluation bias. No independent evaluation or discussion of this bias is present."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Section 6.6 reports computational overhead as a fixed cost, but does not analyze how defense effectiveness varies with compute budget (e.g., performance curves at different threshold settings or reference set sizes)."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper does not discuss whether its 847-case benchmark actually measures real-world prompt injection risk. No analysis of construct validity, comparison with real-world attack distributions, or discussion of whether benchmark performance translates to production security."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "The same defense framework architecture is applied consistently across all 7 models (Section 6.1), controlling for scaffold differences in cross-model comparisons. All models are tested under identical defense configurations."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether the evaluated models may have been trained on similar prompt injection patterns or defense strategies. Given that prompt injection is widely discussed online, models trained on web data may have implicit defenses. This is not addressed."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup leaks information about expected behavior. For instance, the hierarchical guardrails explicitly warn the model about adversarial content, which could be considered feature leakage in the baseline comparison."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The 847 test cases are generated from 200 base templates through 'automated variation generation.' The paper does not discuss whether variations from the same base template are sufficiently independent, raising concerns about pseudo-replication."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection or prevention method is used. The paper does not analyze whether models have prior exposure to the attack patterns in their training data."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "The combined defense framework reduces successful attack rates from 73.2% to 8.7%.",
    365       "evidence": "Table 2 shows overall ASR dropping from 73.2% (baseline) to 8.7% (full defense) across all attack categories and models. Section 6.2 confirms '88.1% reduction from baseline.'",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "The defense framework maintains 94.3% of baseline task performance.",
    370       "evidence": "Table 3 shows task performance of 94.3% with full defenses, measured on MMLU, HellaSwag, and domain-specific QA benchmarks (Section 6.4). However, detailed per-benchmark breakdowns are not provided.",
    371       "supported": "weak"
    372     },
    373     {
    374       "claim": "Content filtering alone reduces attack success to 41.0%.",
    375       "evidence": "Table 2 shows the +Filtering column with 41.0% overall ASR. Section 6.5 notes it reduces Level 1 attacks by 78% but Level 3 attacks by only 42%.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "No single defense mechanism achieves acceptable protection independently.",
    380       "evidence": "Section 6.5 ablation results show each component's partial effectiveness. Content filtering: 78% Level 1 / 42% Level 3 reduction. Guardrails: 62-67% reduction. Response verification: catches 60% of attacks bypassing first two layers.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Claude 2.1 exhibits the lowest baseline attack success rate (61.4%).",
    385       "evidence": "Figure 2 bar chart shows Claude 2.1 with lowest baseline ASR. However, no statistical tests support the comparison, and no variance is reported.",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "Defense overhead represents roughly 2.1% of end-to-end latency.",
    390       "evidence": "Section 6.6 reports 23ms (filtering) + 45ms (verification) = 68ms overhead against 3.2s GPT-4 generation time, yielding ~2.1%. This is a straightforward calculation.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "Potentially fabricated references",
    397       "detail": "Two references have suspiciously sequential page numbers: Hines et al. (2023) 'pages 1234-1245' in EMNLP, and Zhang et al. (2023) 'pages 12345-12356' in ICML. Page numbers starting at 1234 and 12345 are extremely unlikely in real proceedings. The author names in these references (Robert Hines, Jennifer Wu, Sarah Zhang; Michael Zhang, Xiaoming Wang, Linda Chen) do not match verifiable EMNLP 2023 or ICML 2023 publications on these topics."
    398     },
    399     {
    400       "flag": "Placeholder funding acknowledgment",
    401       "detail": "The acknowledgments section contains the literal placeholder text '[funding source]', indicating the paper was not fully prepared. This is a submission template artifact."
    402     },
    403     {
    404       "flag": "No author affiliations",
    405       "detail": "The two authors are listed by name only with no institutional affiliations, which is unusual for a published arXiv paper and prevents assessment of potential conflicts of interest."
    406     },
    407     {
    408       "flag": "Claimed artifact release with no URLs",
    409       "detail": "The abstract states 'We release our benchmark dataset and defense implementations' but the paper contains zero URLs, repository links, or download locations. This is Open Source Theater — claiming openness without delivering."
    410     },
    411     {
    412       "flag": "Internal numerical inconsistency",
    413       "detail": "Contribution 3 claims '89.4% attack mitigation' but Section 6.2 calculates the reduction from 73.2% to 8.7% as '88.1% reduction from baseline.' The abstract also states the 73.2% to 8.7% figure. These are inconsistent (88.1% ≠ 89.4%)."
    414     },
    415     {
    416       "flag": "No error bars or variance on any result",
    417       "detail": "All results across Tables 2 and 3 and Figure 2 are point estimates with no confidence intervals, standard deviations, or repeated-run analysis. For stochastic LLM outputs, this is a significant omission — results could vary meaningfully across runs."
    418     },
    419     {
    420       "flag": "Severely outdated model selection",
    421       "detail": "A November 2025 paper evaluates models from 2023 only (GPT-4-0613, Claude 2.1, PaLM 2, Llama 2 70B). No models from 2024-2025 (GPT-4o, Claude 3/3.5, Llama 3, Gemini) are included. The security landscape has changed significantly with newer models' improved instruction following."
    422     },
    423     {
    424       "flag": "No comparison with prior defense methods",
    425       "detail": "The paper cites three prior defense approaches (Hines et al., Liu et al., Zhang et al.) in related work but does not compare against any of them experimentally. The only baselines are no-defense and the paper's own ablated components."
    426     }
    427   ],
    428   "cited_papers": [
    429     {
    430       "title": "Ignore Previous Prompt: Attack Techniques for Language Models",
    431       "authors": ["Fábio Perez", "Ian Ribeiro", "Deep Ganguli"],
    432       "year": 2022,
    433       "arxiv_id": "2211.09527",
    434       "relevance": "Foundational work on prompt injection attacks demonstrating that language models cannot reliably distinguish instructions from data."
    435     },
    436     {
    437       "title": "You've Been Prompted: Indirect Prompt Injection in Applications Using Large Language Models",
    438       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    439       "year": 2023,
    440       "arxiv_id": "2302.12173",
    441       "relevance": "Demonstrated indirect prompt injection through web content in RAG systems, showing attackers can compromise deployed commercial systems by poisoning retrieval sources."
    442     },
    443     {
    444       "title": "Prompt Injection Attack Against LLM-Integrated Applications",
    445       "authors": ["Yi Liu", "Gelei Deng", "Zhengzi Xu", "Yuekang Li", "Yaowen Zheng", "Ying Zhang", "Lida Zhao", "Tianwei Zhang", "Yang Liu"],
    446       "year": 2023,
    447       "arxiv_id": "2306.05499",
    448       "relevance": "Explored prompt injection attacks against LLM-integrated applications, relevant to understanding the attack surface in deployed AI systems."
    449     }
    450   ]
    451 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs