ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24934B)


      1 {
      2   "paper": {
      3     "title": "Enhancing Security in Large Language Models: A Comprehensive Review of Prompt Injection Attacks and Defenses",
      4     "authors": ["Eleena Sarah Mathew"],
      5     "year": 2025,
      6     "venue": "Journal on Artificial Intelligence",
      7     "doi": "10.32604/jai.2025.069841"
      8   },
      9   "scan_version": 3,
     10   "active_modules": ["survey_methodology"],
     11   "methodology_tags": ["meta-analysis"],
     12   "key_findings": "This narrative review covers a small set of prompt injection attacks (HouYi, VPI) and defenses (StruQ, Signed-Prompt, RA-LLM, LLM Self Defense, PAP defense, GUARDIAN) along with three benchmarks (PromptBench, INJECAGENT, BIPIA). The summarized results suggest defenses like StruQ and RA-LLM can significantly reduce attack success rates (e.g., 98.7% to 10%), though none provide complete protection. The review lacks systematic methodology — no search protocol, no quality assessment of sources, and extremely limited scope despite claiming comprehensiveness.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No code or analysis scripts are released. The paper is a literature review that could have released its classification data or analysis materials but did not."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No dataset or structured extraction of reviewed papers is released. A survey could release its corpus of reviewed papers, extracted data tables, or classification annotations."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment or tooling information is provided. If any analysis scripts or data extraction tools were used, they are not described."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No instructions are provided for reproducing the review. There is no description of how to replicate the paper selection, classification, or synthesis process."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "This is a narrative survey that does not run experiments or perform statistical aggregation. No statistical results of its own are produced."
     41       },
     42       "significance_tests": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "Survey paper with no experiments of its own. No comparative statistical claims are made."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "No experiments conducted. The paper only reports effect sizes from reviewed papers."
     51       },
     52       "sample_size_justified": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "No experiments or statistical analyses are performed in this survey."
     56       },
     57       "variance_reported": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "No experiments are run. The paper only summarizes results from other studies."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not compare itself against prior surveys of prompt injection attacks and defenses. No positioning relative to existing review papers is provided."
     68       },
     69       "baselines_contemporary": {
     70         "applies": false,
     71         "answer": false,
     72         "justification": "No experiments are conducted requiring baseline comparisons."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "No system to ablate — this is a survey paper."
     78       },
     79       "multiple_metrics": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "No experiments are conducted. No evaluation metrics are defined for the review itself."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No system outputs to evaluate. This is a narrative review."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "No experiments requiring test/validation splits."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The paper organizes its review by category: attacks are broken into direct (role-playing, scenario nesting, context-based, code injection, template-based, obfuscation) and indirect (data poisoning, external source manipulation). Defenses are categorized as prevention-based and detection-based. Tables 1 and 2 summarize results by defense type and attack type respectively."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 6 discusses where defenses fall short: 'StruQ is only capable of defending programmatic applications, which makes it vulnerable against web-based chatbots,' VPI settings do not address all possible cases, and evaluation was limited to 7B and 13B models."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports defense limitations: StruQ struggled against TAP attacks (reducing only to 9-36%), debiasing prompting was less effective for VPI, and LLaMA-2 performed worse than GPT-3.5 as the harm classifier in LLM Self Defense (reported accuracies range from 77% to 94.6% for LLaMA-2 versus 98% to 99% for GPT-3.5)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "The abstract claims to provide a 'comprehensive review' and a 'systemic classification framework,' but the paper only covers ~6 attack/defense methods and 3 benchmarks. The abstract also claims to 'integrate and compare results from multiple recent benchmarks' but Tables 1 and 2 contain only 2 entries each, which is far from a meaningful comparison."
    115       },
    116       "causal_claims_justified": {
    117         "applies": false,
    118         "answer": false,
    119         "justification": "The paper makes no causal claims of its own. It reports findings from reviewed papers without making independent causal assertions."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title claims 'A Comprehensive Review' but the paper only reviews a handful of methods (~6 attacks/defenses, 3 benchmarks) from a vast and rapidly growing literature. The scope is not bounded — no explicit statement of what is excluded or what makes this sample representative."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": false,
    128         "answer": false,
    129         "justification": "This is a pure survey/taxonomy paper that presents no empirical results of its own."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": false,
    133         "answer": false,
    134         "justification": "No measurements are taken in this survey paper. It reports other papers' measurements."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "No models are used by the authors. This is a literature review."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "No prompting is performed. This is a literature review."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "No experiments are conducted."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used in this survey."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "No paper selection pipeline is documented. There is no description of search queries, databases searched, inclusion/exclusion criteria, or screening stages. The paper does not explain how the reviewed papers were identified or selected."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations section for the review itself. Section 6 discusses limitations of the reviewed methods (StruQ, VPI, etc.) but not limitations of the review methodology. Section 7 (Conclusion) discusses future directions but not the review's own shortcomings."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats to validity of the review are discussed. The paper does not acknowledge potential selection bias in its choice of papers, the lack of systematic methodology, or the narrow scope of the review."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No explicit scope boundaries are stated. The paper does not clarify what types of attacks/defenses are excluded, what time period is covered, or what criteria determined inclusion. The title claims 'Comprehensive Review' without bounding what that means."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw data (e.g., list of all papers considered, screening decisions, extracted data) is made available. The review's data sources are not independently verifiable."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No description of how papers were found or selected. There is no mention of search queries, databases, date ranges, or any systematic collection procedure."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants in this study."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "No data pipeline is documented. There is no PRISMA-style flow diagram, no counts of papers at each screening stage, and no description of the process from initial collection to final inclusion."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The funding statement explicitly says: 'The author did not receive funding from public, commercial, or non-profit sectors.'"
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliation is disclosed: Department of Computer Science and Engineering, Motilal Nehru National Institute of Technology, Allahabad, India."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": false,
    216         "answer": false,
    217         "justification": "The paper is unfunded per the funding statement."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "The paper states: 'The author declares no conflicts of interest to report regarding the present study.'"
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This is a survey paper that does not evaluate any pre-trained model on a benchmark."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Survey paper — no model evaluation is conducted."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Survey paper — no benchmark evaluation is performed."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this literature review."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants. The paper states: 'This study is a literature review that does not involve human subjects.'"
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "Survey paper — no system or method whose costs could be reported."
    284       },
    285       "compute_budget_stated": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "Survey paper — no computational experiments are run."
    289       }
    290     },
    291     "survey_methodology": {
    292       "prisma_or_structured_protocol": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No PRISMA flow diagram, no structured search protocol, no reproducible search queries, and no stated review methodology. The paper selection appears entirely ad hoc."
    296       },
    297       "quality_assessment_of_sources": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No quality assessment of the reviewed papers is performed. All cited works are treated as equally valid regardless of their methodological rigor. The review does not use any quality scoring rubric or risk-of-bias assessment."
    301       },
    302       "publication_bias_discussed": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No discussion of publication bias. The paper does not consider whether its sources are biased toward positive results or whether negative findings are underrepresented."
    306       }
    307     }
    308   },
    309   "claims": [
    310     {
    311       "claim": "HouYi revealed vulnerabilities in 31 of 36 tested LLM-powered applications, achieving an 86.1% attack success rate.",
    312       "evidence": "Section 3.1 cites Liu et al. (2024) [reference 36]. The claim is directly imported from the reviewed paper with no independent verification.",
    313       "supported": "moderate"
    314     },
    315     {
    316       "claim": "StruQ reduced attack success rates to below 2% for nearly all attack types on Alpaca and Mistral, though TAP attacks were only reduced to 9% (Alpaca) and 36% (Mistral).",
    317       "evidence": "Section 4.1, Table 1. Results are directly cited from Chen et al. (2024) [reference 38].",
    318       "supported": "moderate"
    319     },
    320     {
    321       "claim": "RA-LLM reduced attack success rates from 98.7% to 10% on Vicuna-7B and from 96% to 6.7% on Guanaco-7B.",
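    322       "evidence": "Section 4.3, Table 1. The paper cites these results as [reference 17], the same reference number it uses for LLM Self Defense (Phute et al., 2024). RA-LLM is a separate method (introduced by Cao et al.), so the citation appears to be mis-numbered in the paper and the source of these figures should be verified against the reference list.",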
    323       "supported": "moderate"
    324     },
    325     {
    326       "claim": "This paper provides a 'comprehensive review' with a 'systemic classification framework' for prompt injection attacks and defenses.",
    327       "evidence": "Abstract and Section 2. The classification distinguishes direct vs indirect attacks and prevention vs detection defenses, but only ~6 methods are reviewed in detail. Tables 1 and 2 contain only 2 entries each.",
    328       "supported": "weak"
    329     },
    330     {
    331       "claim": "VPI can change half of answers on targeted topics by poisoning just 0.1% of instruction tuning data.",
    332       "evidence": "Section 3.2 cites Yan et al. (2024) [reference 37]. The claim is imported from the reviewed paper.",
    333       "supported": "moderate"
    334     },
    335     {
    336       "claim": "LLM Self Defense achieves 99% accuracy for GPT-3.5 and 94.6% for LLaMA-2 in classifying harmful content.",
    337       "evidence": "Section 4.4 cites Phute et al. (2024) [reference 17]. Results directly imported from reviewed paper.",
    338       "supported": "moderate"
    339     }
    340   ],
    341   "red_flags": [
    342     {
    343       "flag": "Comprehensive claim with narrow scope",
    344       "detail": "The title claims 'A Comprehensive Review' but only ~6 attack/defense methods and 3 benchmarks are reviewed in detail. The prompt injection literature is vastly larger. Tables 1 and 2 contain only 2 entries each, which is insufficient for meaningful comparison."
    345     },
    346     {
    347       "flag": "No systematic review methodology",
    348       "detail": "No search protocol, no stated databases, no search queries, no inclusion/exclusion criteria, no PRISMA diagram. The paper selection appears entirely ad hoc, making it impossible to assess completeness or bias."
    349     },
    350     {
    351       "flag": "No quality assessment of sources",
    352       "detail": "All reviewed papers are treated equally without any assessment of their methodological quality. This is a recognized weakness in survey methodology — without a quality assessment, the review passes strong and weak evidence along to readers with equal weight."
    353     },
    354     {
    355       "flag": "Results are entirely imported from reviewed papers",
    356       "detail": "The paper presents no original analysis. All numbers in Tables 1 and 2 and the discussion are directly copied from cited papers without independent verification or cross-comparison under consistent conditions."
    357     },
    358     {
    359       "flag": "No limitations of the review itself discussed",
    360       "detail": "The paper discusses limitations of the reviewed methods but never acknowledges limitations of its own review methodology, such as the narrow scope, lack of systematic search, or potential selection bias."
    361     },
    362     {
    363       "flag": "Writing quality concerns",
    364       "detail": "The paper contains redundant paragraphs (Section 1, the introduction, makes the same points about prevention-based and detection-based defenses more than once), the abstract mentions 'INJECENT' (likely a typo for INJECAGENT), and RA-LLM and LLM Self Defense are both cited as reference [17]."
    365     }
    366   ],
    367   "cited_papers": [
    368     {
    369       "title": "Universal and transferable adversarial attacks on aligned language models",
    370       "authors": ["A. Zou", "Z. Wang", "N. Carlini", "M. Nasr", "J.Z. Kolter", "M. Fredrikson"],
    371       "year": 2023,
    372       "arxiv_id": "2307.15043",
    373       "relevance": "Foundational work on automated adversarial attacks against aligned LLMs, demonstrating transfer across open and closed-source models."
    374     },
    375     {
    376       "title": "Prompt injection attack against LLM-integrated applications",
    377       "authors": ["Y. Liu", "G. Deng", "Y. Li", "K. Wang", "Z. Wang", "X. Wang"],
    378       "year": 2024,
    379       "arxiv_id": "2306.05499",
    380       "relevance": "Introduces the HouYi black-box prompt injection attack achieving 86.1% success rate across 36 LLM-integrated applications."
    381     },
    382     {
    383       "title": "Backdooring instruction-tuned large language models with virtual prompt injection",
    384       "authors": ["J. Yan", "V. Yadav", "S. Li", "L. Chen", "Z. Tang", "H. Wang"],
    385       "year": 2024,
    386       "arxiv_id": "2307.16888",
    387       "relevance": "Novel data poisoning attack (VPI) that manipulates LLM behavior by poisoning just 0.1% of instruction tuning data."
    388     },
    389     {
    390       "title": "StruQ: defending against prompt injection with structured queries",
    391       "authors": ["S. Chen", "J. Piet", "C. Sitawarin", "D. Wagner"],
    392       "year": 2024,
    393       "arxiv_id": "2402.06363",
    394       "relevance": "Proposes structured queries as a defense against prompt injection, reducing attack success to below 2% for most techniques."
    395     },
    396     {
    397       "title": "Signed-prompt: a new approach to prevent prompt injection attacks against LLM-integrated applications",
    398       "authors": ["X. Suo"],
    399       "year": 2024,
    400       "arxiv_id": "2401.07612",
    401       "relevance": "Defense strategy using cryptographic signatures on authorized instructions to prevent prompt injection."
    402     },
    403     {
    404       "title": "LLM self defense: by self examination, LLMs know they are being tricked",
    405       "authors": ["M. Phute", "A. Helbling", "M. Hull", "S.-Y. Peng", "S. Szyller", "C. Cornelius"],
    406       "year": 2024,
    407       "arxiv_id": "2308.07308",
    408       "relevance": "Uses a second LLM to screen generated content for harmful output, achieving 99% accuracy with GPT-3.5."
    409     },
    410     {
    411       "title": "How johnny can persuade LLMs to jailbreak them: rethinking persuasion to challenge AI safety by humanizing LLMs",
    412       "authors": ["Y. Zeng", "H. Lin", "J. Zhang", "D. Yang", "R. Jia", "W. Shi"],
    413       "year": 2024,
    414       "arxiv_id": "2401.06373",
    415       "relevance": "Demonstrates persuasive adversarial prompts (PAP) using 40 social science persuasion techniques to jailbreak LLMs."
    416     },
    417     {
    418       "title": "GUARDIAN: A multi-tiered defense architecture for thwarting prompt injection attacks on LLMs",
    419       "authors": ["P. Rai", "S. Sood", "V.K. Madisetti", "A. Bahga"],
    420       "year": 2024,
    421       "doi": "10.4236/jsea.2024.171003",
    422       "relevance": "Multi-layer guardrail framework combining system prompt analysis, fine-tuned classification, and pre-display filtering for LLM safety."
    423     },
    424     {
    425       "title": "PromptBench: towards evaluating the robustness of large language models on adversarial prompts",
    426       "authors": ["K. Zhu", "J. Wang", "J. Zhou", "Z. Wang", "H. Chen", "Y. Wang"],
    427       "year": 2023,
    428       "arxiv_id": "2306.04528",
    429       "relevance": "Benchmark for evaluating LLM resilience to adversarial prompts using Performance Drop Rate metric."
    430     },
    431     {
    432       "title": "InjecAgent: benchmarking indirect prompt injections in tool-integrated large language model agents",
    433       "authors": ["Q. Zhan", "Z. Liang", "Z. Ying", "D. Kang"],
    434       "year": 2024,
    435       "arxiv_id": "2403.02691",
    436       "relevance": "Benchmark for evaluating LLM agent vulnerability to indirect prompt injection with realistic attack scenarios involving tool use."
    437     },
    438     {
    439       "title": "Benchmarking and defending against indirect prompt injection attacks on large language models",
    440       "authors": ["J. Yi", "Y. Xie", "B. Zhu", "E. Kiciman", "G. Sun", "X. Xie"],
    441       "year": 2023,
    442       "arxiv_id": "2312.14197",
    443       "relevance": "BIPIA benchmark evaluating indirect prompt injection risk across 5 application contexts with 250 distinct attacker goals."
    444     },
    445     {
    446       "title": "Prompt injection attacks and defenses in LLM-integrated applications",
    447       "authors": ["Y. Liu", "Y. Jia", "R. Geng", "J. Jia", "N.Z. Gong"],
    448       "year": 2023,
    449       "arxiv_id": "2310.12815",
    450       "relevance": "Comprehensive study of prompt injection attacks and defenses including paraphrasing, retokenization, and LLM-based detection methods."
    451     },
    452     {
    453       "title": "Jailbroken: how does LLM safety training fail?",
    454       "authors": ["A. Wei", "N. Haghtalab", "J. Steinhardt"],
    455       "year": 2023,
    456       "arxiv_id": "2307.02483",
    457       "relevance": "Analyzes failure modes of LLM safety training, foundational for understanding jailbreak vulnerabilities."
    458     },
    459     {
    460       "title": "Defending against indirect prompt injection attacks with spotlighting",
    461       "authors": ["K. Hines", "G. Lopez", "M. Hall", "F. Zarfati", "Y. Zunger", "E. Kiciman"],
    462       "year": 2024,
    463       "arxiv_id": "2403.14720",
    464       "relevance": "Defense technique using spotlighting to protect against indirect prompt injection attacks on LLMs."
    465     }
    466   ],
    467   "engagement_factors": {
    468     "practical_relevance": {
    469       "score": 1,
    470       "justification": "Provides a high-level overview of attack/defense landscape useful for awareness but offers no new tools, techniques, or actionable guidance."
    471     },
    472     "surprise_contrarian": {
    473       "score": 0,
    474       "justification": "Confirms well-known concerns about prompt injection vulnerabilities without challenging any conventional wisdom."
    475     },
    476     "fear_safety": {
    477       "score": 2,
    478       "justification": "Reviews high attack success rates (86.1% for HouYi) and incomplete defenses, reinforcing concerns about LLM security."
    479     },
    480     "drama_conflict": {
    481       "score": 0,
    482       "justification": "No controversy, no strong claims against any organization or established approach."
    483     },
    484     "demo_ability": {
    485       "score": 0,
    486       "justification": "No code, demo, or tool is provided. Purely textual review."
    487     },
    488     "brand_recognition": {
    489       "score": 1,
    490       "justification": "Mentions GPT-4, ChatGPT, and LLaMA but is authored by a single researcher at a lesser-known institution."
    491     }
    492   }
    493 }

Impressum · Datenschutz