ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25692B)


      1 {
      2   "paper": {
      3     "title": "Prompt Injection Attacks on Large Language Models: A Survey of Attack Methods, Root Causes, and Defense Strategies",
      4     "authors": [
      5       "Tongcheng Geng",
      6       "Zhiyuan Xu",
      7       "Yubin Qu",
      8       "W. Eric Wong"
      9     ],
     10     "year": 2025,
     11     "venue": "Computers, Materials & Continua (CMC)",
     12     "doi": "10.32604/cmc.2025.074081"
     13   },
     14   "scan_version": 2,
     15   "active_modules": ["survey_methodology"],
     16   "methodology_tags": ["meta-analysis"],
     17   "key_findings": "This systematic review synthesizes 128 studies (2022-2025) on prompt injection attacks against LLMs, classifying 26 attack methods across three dimensions (vector, objective, implementation) and 37 defense mechanisms across three categories (input preprocessing, system architecture, model-level). The review identifies a co-evolutionary arms race where defenses lag behind attacks, with attacks achieving >90% success rates against unprotected systems while defenses show varying effectiveness (60-80% detection for input filtering, up to 95% for architectural defenses against known patterns). Root causes are attributed to philosophical dilemmas (value alignment), technical flaws (attention mechanism vulnerabilities, autoregressive constraints), and training deficiencies (catastrophic forgetting, reward hacking).",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No code repository or analysis scripts are provided. The paper states 'Availability of Data and Materials: Not applicable.'"
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "Despite describing a data extraction form (Table 3) with 23 fields applied to 128 papers, the extracted dataset is not released. 'Availability of Data and Materials: Not applicable.'"
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No computational environment is specified. The survey could have released its analysis environment or tools but did not."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No reproduction instructions are provided. While the search strings and databases are documented (Section 2.3), there is no package or instructions to reproduce the full analysis pipeline."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "This is a systematic literature review that does not conduct its own experiments. The quantitative claims (e.g., defense effectiveness percentages) are synthesized from individual studies, not from the authors' own statistical analysis."
     46       },
     47       "significance_tests": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "The paper is a qualitative synthesis survey; it does not run its own experiments requiring significance testing."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "No experiments are conducted by the authors. Effect sizes reported are from individual reviewed studies."
     56       },
     57       "sample_size_justified": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "Survey paper with no experiments of its own. The final corpus size (128 papers) results from systematic search rather than sample size calculation."
     61       },
     62       "variance_reported": {
     63         "applies": false,
     64         "answer": false,
     65         "justification": "No experiments conducted; variance reporting is not applicable to a literature survey."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Section 1 explicitly compares against four prior surveys: Peng et al. [21], Rababah et al. [22], Mathew [23], and Kumar et al. [24], identifying specific gaps each survey has."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "All four baseline surveys are from 2024, making them contemporary and relevant comparisons."
     78       },
     79       "ablation_study": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "Survey paper with no system components to ablate."
     83       },
     84       "multiple_metrics": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No experiments are conducted by the authors; evaluation metrics are not applicable."
     88       },
     89       "human_evaluation": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "No system outputs are generated that would require human evaluation."
     93       },
     94       "held_out_test_set": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "No experiments conducted; held-out test sets are not applicable."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down by attack vector type (direct, indirect, multimodal), attack objective, technical implementation, and defense category (input preprocessing, system architecture, model-level) across multiple tables (Tables 4, 5, 6, 7, 8, 9)."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Sections 5.4.1 and 5.4.3 discuss defense limitations including 15-30% false positive rates, 40-60% bypass rates for prompt engineering, and limited generalization to novel attacks. Section 6.2.2 discusses evaluation framework gaps."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper reports that current defenses are insufficient: 'input filtering suffers from 15%–30% false positive rates,' 'prompt engineering lacks formal security guarantees with 40%–60% bypass rates,' and 'significant gaps persist against novel attack vectors' (Section 7.2)."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims about >90% attack success rates, 60-80% detection rates, 95% protection for architectural defenses, and 37 defense approaches are all supported in the body. Table 7 provides quantitative evidence for defense effectiveness, although the 60-80% and 95% figures do not map directly to its tabulated values, and Table 6 catalogs 37 defense studies."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "Section 7.2 claims 'more capable models exhibit higher vulnerability to text-based attacks (Pearson correlation coefficient 0.6635, p < 0.001)' and frames this as a 'counterintuitive finding' implying causation, but this is a correlational analysis with no methodology details provided (which models, which data points, how capability was measured). The root cause analysis (Section 4) makes causal attributions from literature synthesis without systematic causal evidence."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The survey clearly states its scope: 128 peer-reviewed studies from 2022-2025, English language, using defined search strings across nine databases (Section 2.3). IC/EC criteria (Table 1) bound the review to prompt injection specifically."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper presents its synthesis (e.g., that root causes lie in philosophical, technical, and training dimensions) as definitive without considering alternative frameworks or acknowledging that the patterns identified could be artifacts of the literature's own biases."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper measures coverage of published literature and frames this as representing 'the field's' state, without discussing the gap between reviewed papers and the actual state of prompt injection research (e.g., unpublished industry work, classified research, non-English publications)."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "Survey paper that does not use any AI models for its analysis."
    147       },
    148       "prompts_provided": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "The paper does not use prompting."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "Survey paper with no experiments requiring hyperparameters."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Sections 2.3-2.6 document the full pipeline: search strings based on PICO framework, nine databases searched, 586 initial studies retrieved with per-database counts, IC/EC criteria (Table 1) applied to reduce to 108, snowballing added 20 to reach 128, and quality assessment checklist (Table 2) applied by two independent authors."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "The paper discusses limitations of the field's defense mechanisms (Section 5.4) and evaluation frameworks (Section 6.2), but has no dedicated section discussing limitations of the survey methodology itself. Potential issues like language bias (English only), database coverage gaps, or inter-rater reliability of the data extraction are not systematically addressed."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No specific threats to the validity of the survey itself are discussed. The paper does not address potential issues such as: whether the search strings missed relevant papers, whether the IC/EC criteria were applied consistently, or whether the quality assessment introduces systematic bias."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 2.1 defines prompt injection attacks formally, and Section 2.4 lists explicit IC/EC criteria (Table 1) including English language, January 2022-August 2025 timeframe, and specific focus requirements."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The completed data extraction table (Table 3 describes the form) is not released. 'Availability of Data and Materials: Not applicable.' The raw coded data from 128 papers is not available for independent verification."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 2.3 describes the search strategy in detail: PICO-based search strings, nine specific databases (IEEE Xplore, ACM, Science Direct, Springer Link, Wiley, Elsevier, Google Scholar, DBLP, ArXiv), searched on 04 August 2025, with per-database hit counts."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. The paper sources are academic databases described in the methodology."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Section 2 documents the full pipeline: 586 initial studies → 108 after IC/EC criteria → 128 after snowballing → quality assessment → data extraction (Table 3) by two independent authors with third-author validation → data synthesis."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Funding is disclosed: '2023 Higher Education Scientific Research Planning Project of China Society of Higher Education (No. 23PG0408),' Jiangsu Province grants, Nantong Science and Technology Project, and others listed in the Funding Statement."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are listed: State Information Center (Beijing), Hohai University, Jiangsu College of Engineering and Technology, and University of Texas at Dallas."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Funders are Chinese academic research grants and university programs with no commercial interest in the survey's outcomes regarding specific attack or defense technologies."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "'Conflicts of Interest: The authors declare no conflicts of interest to report regarding the present study.'"
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "This is a survey paper that does not evaluate a pre-trained model's capability on any benchmark."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Survey paper; no model evaluation on benchmarks."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "Survey paper; no model evaluation on benchmarks."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this systematic literature review."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. 'Ethics Approval: Not applicable.'"
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants. (Paper inclusion/exclusion criteria for the literature search are covered under data_preprocessing_documented.)"
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "Survey paper with no computational method of its own."
    289       },
    290       "compute_budget_stated": {
    291         "applies": false,
    292         "answer": false,
    293         "justification": "Survey paper; no computational experiments."
    294       }
    295     },
    296     "survey_methodology": {
    297       "prisma_or_structured_protocol": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "The paper explicitly follows Kitchenham's guidelines [25] for systematic literature reviews. It uses PICO framework for search string formulation (Section 2.3), defines IC/EC criteria (Table 1), applies snowballing per Wohlin's guidelines [32], uses a quality assessment checklist (Table 2), and documents a structured data extraction form (Table 3). Search process is visualized in Fig. 1."
    301       },
    302       "quality_assessment_of_sources": {
    303         "applies": true,
    304         "answer": true,
    305         "justification": "Table 2 presents a quality assessment checklist with four criteria areas (Attack Method, Defense Mechanism, Experimental Evaluation, Reproducibility) with 12 sub-criteria. Section 2.4 states: 'The quality assessment checklist was independently applied to all 128 primary studies by two authors. In case of disagreement, discussions were held to reach a consensus.'"
    306       },
    307       "publication_bias_discussed": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The paper does not discuss publication bias. There is no funnel plot, no discussion of negative-result underrepresentation, and no acknowledgment that the literature may skew toward successful attacks. Section 6.2.2 mentions 'Subjectivity and bias in dataset construction' but this concerns evaluation datasets in the reviewed papers, not publication bias in the survey's own source literature."
    311       }
    312     }
    313   },
    314   "claims": [
    315     {
    316       "claim": "Prompt injection attacks achieve over 90% success rates against unprotected systems.",
    317       "evidence": "Section 7.2 states this as a key finding, and Table 7 shows baseline ASRs of 98.1% (GCG on Vicuna), 79% (adversarial prompts), 100% (DistilBERT), and 50-57% (various settings).",
    318       "supported": "moderate"
    319     },
    320     {
    321       "claim": "Input preprocessing defenses achieve 60%-80% detection rates against known attacks.",
    322       "evidence": "Abstract and Section 7.1 state this. Table 7 shows defense-reduced ASRs ranging from 0% to 27.82%, suggesting detection rates higher than 60-80%. The 60-80% figure does not appear to come directly from the tabulated data.",
    323       "supported": "weak"
    324     },
    325     {
    326       "claim": "Advanced architectural defenses demonstrate up to 95% protection against known patterns.",
    327       "evidence": "Abstract claim. Table 7 shows some defenses achieving near-complete protection (SmoothLLM: 98.1% → 0.8%, Erase-and-Check: 100% → 0%), but these are input preprocessing, not architectural defenses. The 95% figure for architectural defenses is not directly supported by the tabulated data.",
    328       "supported": "weak"
    329     },
    330     {
    331       "claim": "37 distinct defense approaches were identified across three categories.",
    332       "evidence": "Table 6 lists 37 entries across input preprocessing, system architecture, and model-level defense categories. This is directly verifiable from the table.",
    333       "supported": "strong"
    334     },
    335     {
    336       "claim": "More capable models exhibit higher vulnerability to text-based attacks (Pearson r=0.6635, p<0.001).",
    337       "evidence": "Section 7.2 Key Findings states this as a 'counterintuitive finding,' but no methodology is provided for how this correlation was computed — what models were included, how capability was measured, what vulnerability metric was used.",
    338       "supported": "weak"
    339     },
    340     {
    341       "claim": "Defense strategies have evolved from passive input preprocessing to comprehensive multi-layered approaches.",
    342       "evidence": "Table 8 provides a temporal analysis showing the co-evolutionary arms race from 2022-2025 with specific citations at each stage. The progression from basic defenses to system-level architecture is documented across Sections 5.1-5.3.",
    343       "supported": "strong"
    344     },
    345     {
    346       "claim": "Input filtering suffers from 15%-30% false positive rates and detection drops below 40% for novel attacks.",
    347       "evidence": "Section 5.4.1 states these figures but provides no specific citations or aggregation methodology for how these ranges were derived.",
    348       "supported": "weak"
    349     }
    350   ],
    351   "red_flags": [
    352     {
    353       "flag": "Unsourced quantitative claims",
    354       "detail": "Several key claims in the abstract and conclusions (60-80% detection rates, 95% architectural defense effectiveness, 15-30% FPR for input filtering) do not map clearly to the data presented in the paper's own tables. These appear to be informal aggregations without documented methodology."
    355     },
    356     {
    357       "flag": "Unexplained statistical finding",
    358       "detail": "The Pearson correlation claim (r=0.6635, p<0.001) for model capability vs. vulnerability appears in Section 7.2 with no methodology: no description of what data points were used, how capability was measured, or how vulnerability was operationalized. This is a significant empirical claim with zero reproducibility information."
    359     },
    360     {
    361       "flag": "No limitations of own methodology",
    362       "detail": "Despite following Kitchenham's SLR guidelines, the paper has no section discussing limitations of the survey itself — language bias (English only), search engine coverage gaps, potential for inconsistent coding across authors, or whether the quality checklist excluded relevant work."
    363     },
    364     {
    365       "flag": "No replication package",
    366       "detail": "The paper describes a detailed 23-field data extraction form (Table 3) applied to 128 papers but releases neither the coded data nor the analysis. For a paper advocating reproducibility in the field, this is a significant gap. 'Availability of Data and Materials: Not applicable.'"
    367     },
    368     {
    369       "flag": "Publication bias not addressed",
    370       "detail": "The survey does not consider that its source literature may systematically over-represent successful attacks and under-represent failed defense attempts or negative results, potentially inflating attack success rates."
    371     }
    372   ],
    373   "cited_papers": [
    374     {
    375       "title": "Ignore previous prompt: attack techniques for language models",
    376       "authors": ["Fábio Perez", "Ian Ribeiro"],
    377       "year": 2022,
    378       "arxiv_id": "2211.09527",
    379       "relevance": "Foundational paper on direct prompt injection attacks against LLMs."
    380     },
    381     {
    382       "title": "Not what you've signed up for: compromising real-world LLM-integrated applications with indirect prompt injection",
    383       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    384       "year": 2023,
    385       "relevance": "First systematic description of indirect prompt injection attacks via external data sources."
    386     },
    387     {
    388       "title": "Universal and transferable adversarial attacks on aligned language models",
    389       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J. Zico Kolter", "Matt Fredrikson"],
    390       "year": 2023,
    391       "arxiv_id": "2307.15043",
    392       "relevance": "Introduced the GCG attack for generating universal adversarial suffixes against aligned LLMs."
    393     },
    394     {
    395       "title": "HarmBench: a standardized evaluation framework for automated red teaming and robust refusal",
    396       "authors": ["Mantas Mazeika", "Long Phan", "Xuwang Yin"],
    397       "year": 2024,
    398       "relevance": "Standardized evaluation framework for red teaming LLMs, proposed the R2D2 adversarial training method."
    399     },
    400     {
    401       "title": "AgentDojo: a dynamic environment to evaluate prompt injection attacks and defenses for LLM agents",
    402       "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunovic"],
    403       "year": 2024,
    404       "relevance": "First dynamic security evaluation framework for LLM agents in untrusted data environments."
    405     },
    406     {
    407       "title": "Defeating prompt injections by design",
    408       "authors": ["Edoardo Debenedetti", "Ilia Shumailov", "Tianqi Fan"],
    409       "year": 2025,
    410       "arxiv_id": "2503.18813",
    411       "relevance": "CaMeL framework applying software security design principles to defend against prompt injection."
    412     },
    413     {
    414       "title": "Asleep at the keyboard? assessing the security of GitHub Copilot's code contributions",
    415       "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"],
    416       "year": 2025,
    417       "doi": "10.1145/3610721",
    418       "relevance": "Evaluates security vulnerabilities in LLM-generated code from GitHub Copilot."
    419     },
    420     {
    421       "title": "Jailbroken: how does LLM safety training fail?",
    422       "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"],
    423       "year": 2023,
    424       "relevance": "Identifies two failure modes (competing objectives, mismatched generalization) explaining LLM vulnerability to jailbreaking."
    425     },
    426     {
    427       "title": "InjecAgent: benchmarking indirect prompt injections in tool-integrated large language model agents",
    428       "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"],
    429       "year": 2024,
    430       "relevance": "Benchmark framework for evaluating indirect prompt injection attacks on tool-integrated LLM agents."
    431     },
    432     {
    433       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    434       "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
    435       "year": 2024,
    436       "relevance": "First systematic evaluation framework formalizing prompt injection attack strategies and defense methods."
    437     },
    438     {
    439       "title": "StruQ: defending against prompt injection with structured queries",
    440       "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"],
    441       "year": 2025,
    442       "relevance": "Applies control-data separation principles to defend against prompt injection at the API level."
    443     },
    444     {
    445       "title": "Constitutional AI: harmlessness from AI feedback",
    446       "authors": ["Yuntao Bai", "Saurav Kadavath", "Sandipan Kundu"],
    447       "year": 2022,
    448       "arxiv_id": "2212.08073",
    449       "relevance": "Foundational work on AI safety alignment relevant to understanding prompt injection vulnerability root causes."
    450     }
    451   ]
    452 }

Impressum · Datenschutz