ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (22689B)


      1 {
      2   "paper": {
      3     "title": "Prompt Injection attack against LLM-integrated Applications",
      4     "authors": [
      5       "Yi Liu",
      6       "Gelei Deng",
      7       "Yuekang Li",
      8       "Kailong Wang",
      9       "Zihao Wang",
     10       "Xiaofeng Wang",
     11       "Tianwei Zhang",
     12       "Yepang Liu",
     13       "Haoyu Wang",
     14       "Yan Zheng",
     15       "Leo Yu Zhang",
     16       "Yang Liu"
     17     ],
     18     "year": 2023,
     19     "venue": "arXiv",
     20     "arxiv_id": "2306.05499"
     21   },
     22   "scan_version": 2,
     23   "active_modules": ["experimental_rigor"],
     24   "methodology_tags": ["benchmark-eval", "case-study"],
     25   "key_findings": "HOUYI, a black-box prompt injection attack technique inspired by SQL injection and XSS, achieves an 86.1% success rate across 36 real-world LLM-integrated applications, identifying 31 as vulnerable. The attack uses three components (framework, separator, disruptor) with iterative refinement. Ten vendors including Notion confirmed the vulnerabilities. Existing defenses (instruction defense, sandwich defense, XML tagging, etc.) are shown to be insufficient against HOUYI's generated payloads.",
     26   "checklist": {
     27     "artifacts": {
     28       "code_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper mentions implementing HOUYI in Python (2,150 lines) but provides no repository URL or code release."
     32       },
     33       "data_released": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No dataset of prompts, attack payloads, or application responses is released. Application names are anonymized (Table 5)."
     37       },
     38       "environment_specified": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No environment specification, requirements.txt, or dependency details are provided beyond mentioning Python and GPT-3.5-turbo."
     42       },
     43       "reproduction_instructions": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No reproduction instructions are provided. The paper describes methodology at a high level but does not provide steps to replicate experiments."
     47       }
     48     },
     49     "statistical_methodology": {
     50       "confidence_intervals_or_error_bars": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "Results are reported as success counts out of 5 attempts (e.g., '5/5', '3/5') with no confidence intervals or error bars."
     54       },
     55       "significance_tests": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "No statistical significance tests are used. Claims of effectiveness are based on raw success/failure counts across 5 attempts per scenario."
     59       },
     60       "effect_sizes_reported": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper reports the overall success rate (86.1%, 31/36 applications) and per-application success rates across exploit scenarios (Table 4), providing baseline context (prior attacks succeed on 2/10)."
     64       },
     65       "sample_size_justified": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No justification for why 36 applications were chosen, why 5 attempts per scenario, or whether this sample size is sufficient for the claims made."
     69       },
     70       "variance_reported": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No variance or standard deviation across runs is reported. Each attack is run 5 times but only raw counts are shown."
     74       }
     75     },
     76     "evaluation_design": {
     77       "baselines_included": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Section 3 (pilot study) tests three existing attack strategies (direct injection, escape characters, context ignoring) as baselines on 10 applications, achieving only partial success on 2/10."
     81       },
     82       "baselines_contemporary": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Baselines include contemporary prompt injection techniques from Perez & Ribeiro (2022), Apruzzese et al. (2023), and Greshake et al. (2023), which were state-of-the-art at the time."
     86       },
     87       "ablation_study": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Section 6.3 presents an ablation study on the separator component with three variants (HOUYI-SYNTAX-ONLY, HOUYI-LANGUAGE-ONLY, HOUYI-SEMANTIC-ONLY), shown in Figure 5."
     91       },
     92       "multiple_metrics": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Five exploit scenarios are tested (prompt leaking, code generation, content manipulation, spam generation, information gathering), each evaluated separately in Table 4."
     96       },
     97       "human_evaluation": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The paper states 'We manually verify each result to ensure its accuracy' (Section 6.1) and 'each prompt injection attack is manually scrutinized to ascertain its success' (Section 6.1)."
    101       },
    102       "held_out_test_set": {
    103         "applies": false,
    104         "answer": false,
    105         "justification": "This is a security testing study on real applications, not an ML model evaluation with train/test splits."
    106       },
    107       "per_category_breakdown": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Table 4 provides per-application, per-exploit-scenario breakdown of results. Table 1 provides per-category breakdown for the pilot study."
    111       },
    112       "failure_cases_discussed": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 6.2 discusses 5 applications that resisted attacks and provides detailed reasons (domain-specific LLMs, internal processing, multimodal models). Section 3.2.2 provides a detailed failure case study."
    116       },
    117       "negative_results_reported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The pilot study (Section 3) reports that existing attacks mostly fail (Table 1 shows overwhelmingly negative results). Section 6.2 reports 5/36 applications could not be exploited."
    121       }
    122     },
    123     "claims_and_evidence": {
    124       "abstract_claims_supported": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Abstract claims of 31/36 vulnerable applications (86.1%), 10 vendor confirmations, and specific attack capabilities (prompt theft, LLM abuse) are all supported by Section 6 results."
    128       },
    129       "causal_claims_justified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The ablation study (Section 6.3) provides controlled single-variable manipulation for the causal claim that each separator strategy contributes to effectiveness. The main claim is descriptive (X applications are vulnerable)."
    133       },
    134       "generalization_bounded": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The title claims 'Prompt Injection attack against LLM-integrated Applications' broadly, but testing was limited to 36 applications from SUPERTOOLS. The paper does not bound its generalization to this specific sample or application types."
    138       },
    139       "alternative_explanations_discussed": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No discussion of alternative explanations for why HOUYI succeeds. For instance, the success could be due to the specific LLM versions used at the time of testing, which would change as models update."
    143       },
    144       "proxy_outcome_distinction": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper measures success as whether specific keywords appear in output (e.g., correct answer to Q1/Q2) or output deviates from intended functionality. No discussion of whether these proxy measures capture the full security risk claimed (e.g., 'millions of users potentially affected')."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "The paper states 'GPT3.5-turbo' for the feedback/generation LLM but does not specify a snapshot date or version. Target applications' underlying LLM versions are unknown (black-box)."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Table 2 provides concrete examples of framework, separator, and disruptor components. The DECISIONAI example (Section 5.1) shows a complete injected prompt. Table 3 shows disruptor prompts."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 6.1 states 'This model functions under the default parameters, with both the temperature and top_p set as 1' for GPT-3.5-turbo."
    165       },
    166       "scaffolding_described": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The three-phase workflow (context inference, payload generation, feedback refinement) is described in detail in Sections 4-5, with Algorithm 1 providing pseudocode and Figure 4 showing the architecture."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 6.1 describes the evaluation setup: application selection criteria (availability + LLM integration), manual API extraction, documentation collection, and success criteria definitions."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 7 (Discussion) includes subsections on defenses (7.1), separator component limitations (7.2), and reproducibility concerns (7.3)."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 7.3 discusses specific reproducibility threats: 'certain detected vulnerabilities may become non-reproducible over time' due to patching and LLM evolution. Section 7.2 acknowledges the separator strategies 'likely only scratch the surface.'"
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The paper does not explicitly state what settings or application types are excluded from its claims. The threat model (Section 2.3) excludes some attack vectors but does not bound the generalization of results."
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No raw attack logs, prompts, or response data are made available. Applications are anonymized, preventing verification."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section 6.1 describes how applications were selected from SUPERTOOLS, selection criteria (availability, LLM integration), and how attacks were executed (RESTful API extraction, 5 repetitions)."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No human participants. The study tests software applications, not people."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "The pipeline from application selection through attack execution to result classification is described at a high level, but specifics like how many total attack attempts were made, how many internal errors occurred, and filtering criteria are not fully documented."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No funding or acknowledgments section is visible in the paper."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "All author affiliations are listed: Griffith University, NTU, UNSW, HUST, Indiana University, SUSTech, Tianjin University."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No funding information is disclosed, so independence cannot be assessed."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests or financial interests statement is present in the paper."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "This paper tests prompt injection attacks on real applications — it does not evaluate a pre-trained model's capability on a benchmark."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "Not a benchmark evaluation of model knowledge. The study tests security vulnerabilities of deployed applications."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "Not a benchmark evaluation of model knowledge. The study tests security vulnerabilities of deployed applications."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants. The study tests software applications."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No cost of running HOUYI is reported. The paper mentions PAREA's cost ($259.2/day for prompt abuse) but not the cost of executing the attack toolkit itself."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No total computational budget for the attack experiments is stated. The number of API calls, tokens consumed, or time spent is not reported."
    302       }
    303     },
    304     "experimental_rigor": {
    305       "seed_sensitivity_reported": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "No seed sensitivity analysis. Results vary across runs (acknowledged via 5 repetitions) but no systematic seed analysis is reported."
    309       },
    310       "number_of_runs_stated": {
    311         "applies": true,
    312         "answer": true,
    313         "justification": "Section 6.1 states 'we execute each exploit prompt five times' and the pilot study also uses 5 repetitions."
    314       },
    315       "hyperparameter_search_budget": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "No hyperparameter search budget is stated. The iterative refinement process (Algorithm 1) does not report how many iterations were needed per application."
    319       },
    320       "best_config_selection_justified": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "Algorithm 1 describes iterative component generation but does not report how many configurations were tried per application or justify the selection of reported results."
    324       },
    325       "multiple_comparison_correction": {
    326         "applies": false,
    327         "answer": false,
    328         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    329       },
    330       "self_comparison_bias_addressed": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The authors evaluate their own HOUYI system against their own implementations of baseline attacks without acknowledging potential author-evaluation bias."
    334       },
    335       "compute_budget_vs_performance": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "No analysis of how compute budget (number of iterations, API calls) relates to attack success rate."
    339       },
    340       "benchmark_construct_validity": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether the 5 exploit scenarios and success criteria adequately capture real-world prompt injection risk. The success criteria (keyword matching) may not reflect actual security impact."
    344       },
    345       "scaffold_confound_addressed": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "HOUYI is itself the scaffold being evaluated, not comparing models across different scaffolds."
    349       }
    350     }
    351   },
    352   "claims": [
    353     {
    354       "claim": "HOUYI achieves an 86.1% success rate in prompt injection attacks across 36 real-world LLM-integrated applications (31/36 vulnerable).",
    355       "evidence": "Table 4 in Section 6.2 shows per-application results across 5 exploit scenarios with 5 attempts each. 31 applications had at least one successful exploit.",
    356       "supported": "strong"
    357     },
    358     {
    359       "claim": "Existing prompt injection techniques are largely ineffective against real-world LLM-integrated applications.",
    360       "evidence": "Table 1 in Section 3.2 shows existing attacks succeed meaningfully on only 2/10 pilot applications (chatbots), with 0/10 success on Q3 (prompt leaking).",
    361       "supported": "moderate"
    362     },
    363     {
    364       "claim": "10 vendors have validated the discovered vulnerabilities, including Notion (20M+ users).",
    365       "evidence": "Section 6.4 states 10 vendors confirmed findings. Notion, Parea, and Writesonic are named. Direct quotes from Parea developers confirming the vulnerability are provided.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Prompt abuse could cause financial losses of $259.2/day per vulnerable application like PAREA.",
    370       "evidence": "Section 6.4.2 derives this from 90k tokens/minute at $0.002/1k tokens over 1440 minutes. Based on GPT-3.5-turbo pricing.",
    371       "supported": "weak"
    372     },
    373     {
    374       "claim": "Existing defenses (instruction defense, post-prompting, sandwich defense, XML tagging, etc.) are insufficient against HOUYI.",
    375       "evidence": "Section 7.1 states defenses were implemented and evaluated on open-source projects, and HOUYI can 'effectively circumvent these security measures,' but no detailed results are provided.",
    376       "supported": "weak"
    377     }
    378   ],
    379   "red_flags": [
    380     {
    381       "flag": "No code or data release",
    382       "detail": "Despite implementing a 2,150-line Python toolkit, no code, attack logs, or data are released, making the results impossible to independently verify."
    383     },
    384     {
    385       "flag": "Very small sample per application",
    386       "detail": "Only 5 attempts per exploit scenario per application. LLM outputs are stochastic, making 5 attempts insufficient to reliably estimate success rates."
    387     },
    388     {
    389       "flag": "Defense evaluation lacks detail",
    390       "detail": "The claim that HOUYI bypasses all existing defenses (Section 7.1) is stated without detailed experimental results — only 'manual inspection' is mentioned."
    391     },
    392     {
    393       "flag": "Financial loss extrapolation",
    394       "detail": "The $259.2/day loss claim assumes 90k tokens/minute are entirely from malicious prompt abuse, which is unrealistic. The extrapolation methodology is simplistic."
    395     },
    396     {
    397       "flag": "Anonymization prevents verification",
    398       "detail": "Most applications are anonymized (31 of 36), making it impossible to verify which applications were tested or reproduce the results."
    399     }
    400   ],
    401   "cited_papers": [
    402     {
    403       "title": "Ignore Previous Prompt: Attack Techniques For Language Models",
    404       "authors": ["Fábio Perez", "Ian Ribeiro"],
    405       "year": 2022,
    406       "relevance": "Foundational prompt injection attack paper targeting GPT-3, serves as baseline in this study."
    407     },
    408     {
    409       "title": "Not what you've signed up for: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    410       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    411       "year": 2023,
    412       "relevance": "Introduces indirect prompt injection via poisoned external resources, complementary attack vector to HOUYI's direct approach."
    413     },
    414     {
    415       "title": "Jailbreaking ChatGPT via Prompt Engineering: An Empirical Study",
    416       "authors": ["Yi Liu", "Gelei Deng", "Zhengzi Xu", "Yuekang Li"],
    417       "year": 2023,
    418       "relevance": "Related jailbreak attack study by overlapping authors, explores prompt manipulation techniques for LLM safety bypass."
    419     },
    420     {
    421       "title": "Toolformer: Language models can teach themselves to use tools",
    422       "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì"],
    423       "year": 2023,
    424       "relevance": "Demonstrates LLMs as tool-using agents, relevant to understanding the attack surface of LLM-integrated applications."
    425     },
    426     {
    427       "title": "ReAct: Synergizing reasoning and acting in language models",
    428       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"],
    429       "year": 2023,
    430       "relevance": "Foundational agentic LLM framework combining reasoning and actions, relevant to LLM-integrated application security."
    431     },
    432     {
    433       "title": "Generative agents: Interactive simulacra of human behavior",
    434       "authors": ["Joon Sung Park", "Joseph C O'Brien", "Carrie J Cai"],
    435       "year": 2023,
    436       "relevance": "LLM-backed autonomous agents that could be vulnerable to prompt injection attacks."
    437     },
    438     {
    439       "title": "\"Real Attackers Don't Compute Gradients\": Bridging the Gap between Adversarial ML Research and Practice",
    440       "authors": ["Giovanni Apruzzese", "Hyrum S. Anderson"],
    441       "year": 2023,
    442       "relevance": "Discusses gap between adversarial ML research and real-world attacks, provides context for practical prompt injection."
    443     },
    444     {
    445       "title": "OWASP Top 10 List for Large Language Models",
    446       "year": 2023,
    447       "relevance": "Lists prompt injection as top LLM security risk, providing industry context for this research."
    448     }
    449   ]
    450 }

Impressum · Datenschutz