scan.json (21529B)
1 { 2 "paper": { 3 "title": "Signed-Prompt: A New Approach to Prevent Prompt Injection Attacks Against LLM-Integrated Applications", 4 "authors": ["Xuchen Suo"], 5 "year": 2024, 6 "venue": "AIP Conference Proceedings", 7 "arxiv_id": "2401.07612", 8 "doi": "10.48550/arXiv.2401.07612" 9 }, 10 "scan_version": 2, 11 "active_modules": [], 12 "methodology_tags": ["case-study"], 13 "key_findings": "The paper proposes 'Signed-Prompt,' a defense against prompt injection attacks that replaces sensitive instructions with rare character sequences ('signatures') so LLMs can distinguish authorized from unauthorized commands. Two implementations are tested: prompt engineering (ChatGPT-4) achieving 100% correctness and 0% attack success, and fine-tuning (ChatGLM-6B) achieving 73-100% correctness and 0% attack success. However, evaluation is limited to a single command type ('delete') with a custom dataset of unknown size.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "The 'Delete Command Dataset' is described and a few examples shown in Table 1, but no download link or public release is provided." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No environment specifications, dependency lists, or setup details are provided. Only the model names (ChatGPT-4, ChatGLM-6B) are mentioned." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No reproduction instructions, scripts, or step-by-step guides are provided. The reader would have to reverse-engineer the experimental setup from the paper description." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Only point estimate correctness rates are reported (100%, 96.67%, 86.67%, 73.34%, 0%). No confidence intervals or error bars are provided." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims 'exceptional effectiveness' and 'remarkable stability' but provides no statistical significance tests to support any comparative claims." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": false, 51 "justification": "Only raw correctness and attack success rates are reported. No effect sizes (Cohen's d, odds ratios, etc.) or baseline-contextualized improvements are provided." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The dataset size is never explicitly stated. Table 1 shows 'first 3 entries each group' but the total N per group is not reported. No justification for sample size is given." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance, standard deviation, or any spread measure is reported. Results appear to be from single runs with no repeated trials." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": false, 68 "justification": "The paper critiques existing defenses (output filtering, input filtering, delimiters, Dual LLM) in the introduction but does not experimentally compare Signed-Prompt against any of them." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": false, 73 "justification": "No baselines are included in the experiments, so there are no contemporary baselines to evaluate." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": false, 78 "justification": "While two implementations are compared (prompt engineering vs fine-tuning), this is not an ablation study removing components. The Encoder and Adjusted LLM components are never tested independently or removed to measure their individual contributions." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Two metrics are used: correctness rate (for signed user commands) and attack success rate (for unsigned attacker commands), as shown in Table 3." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation is conducted. All evaluation is automated through comparison of actual vs expected outputs." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "For LLM-FT, the paper states it was fine-tuned on data 'similar to the previous test data in Table 1,' suggesting potential overlap between training and test data. No explicit train/test separation is described." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results in Tables 1 and 3 are broken down by four groups: Direct, Multilingual, Varied Expression, and Implication." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper discusses LLM-FT's lower correctness rates (86.67% for Direct, 73.34% for Multilingual) and attributes them to the complexity of fine-tuning and potential insufficient fine-tuning of ChatGLM-6B." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The LLM-FT's imperfect correctness rates are reported and discussed as a negative finding, attributed to fine-tuning quality challenges." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": false, 115 "justification": "The abstract claims 'substantial resistance to various types of prompt injection attacks' and 'robust defense strategy,' but experiments only test one command type ('delete') with its variants. The range of 'various types' is extremely narrow — all test cases are semantic variations of a single action." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper claims Signed-Prompt 'enables effective and stable prevention of the execution of attacker's commands,' a causal claim. But the evidence comes from testing only one command type on a custom dataset without controls or comparison to alternative defenses." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper generalizes from testing a single command type ('delete') to claiming defense against prompt injection attacks broadly. The title says 'Prevent Prompt Injection Attacks' without bounding to the specific scenario tested. No mention that results may not extend to other command types, attack strategies, or real-world scenarios." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No alternative explanations are discussed. For example, the 0% attack success rate could be an artifact of the narrow test scope (single command type) rather than evidence of robust defense. No discussion of how attackers might circumvent the approach." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper measures correctness of token replacement and attack success on a custom dataset, then frames this as evidence of a 'robust defense strategy' for prompt injection broadly. The gap between the proxy (single-command substitution correctness) and the claimed outcome (general prompt injection defense) is not acknowledged." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper specifies 'ChatGPT-4 from OpenAI' and 'ChatGLM-6B.' ChatGPT-4 is a marketing name without a version or snapshot date (no 'gpt-4-0613' or similar). ChatGLM-6B is a specific model but no version/checkpoint is given." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper uses prompt engineering for both the Encoder and LLM-PE but never provides the actual system prompts or instructions given to ChatGPT-4. Only natural language descriptions of what the prompts do are provided." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No hyperparameters are reported — no temperature, top-p, max tokens for API calls, no learning rate, batch size, or epochs for the fine-tuning of ChatGLM-6B." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The system consists of an Encoder and an Adjusted LLM, which are direct model calls, not agentic workflows." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": false, 162 "justification": "The 'Delete Command Dataset' creation process is not documented. The paper describes four groups of test cases (Direct, Multilingual, Varied Expression, Implication) but does not explain how the test cases were constructed, how many there are, or what criteria were used." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "No dedicated limitations section exists. The conclusion mentions future research directions (more efficient implementation, adapting to new attacks, integrating with other security measures) but does not discuss limitations of the current work." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity are discussed. The paper does not address any potential weaknesses of the evaluation or the method." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "No scope boundaries are stated. The paper does not clarify what the results do NOT show, what attack types were NOT tested, or under what conditions the method might fail." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw data is available. The full 'Delete Command Dataset' is not released; only the first 3 examples per group are shown in Table 1." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": false, 191 "justification": "The data collection procedure for the 'Delete Command Dataset' is not described. The paper says it 'encompasses a variety of languages, diverse expressions, and implications' but provides no detail on how test cases were created or selected." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants are involved. The test dataset appears to be manually constructed but is not a standard benchmark." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "No data pipeline is documented. The path from dataset creation to experimental results has no documented intermediate steps, filtering criteria, or transformation logic." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information is disclosed anywhere in the paper. The author is affiliated with Hong Kong Polytechnic University but no grants or funding sources are mentioned." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "The author's affiliation is clearly disclosed: 'Department of Electrical and Electronic Engineering, The Hong Kong Polytechnic University, Hong Kong, China.'" 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence of funder cannot be assessed. The absence of a funding disclosure is itself a gap." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "The paper tests a defense mechanism (Signed-Prompt) rather than evaluating a pre-trained model's capability on a knowledge benchmark. Contamination is not relevant to this paper type." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "The paper tests a defense mechanism rather than evaluating model knowledge on a benchmark. Train/test overlap in the traditional contamination sense does not apply." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "Not applicable — the paper evaluates a defense approach, not a model's knowledge on a pre-existing benchmark." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants are involved in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants are involved in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants are involved in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants are involved in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants are involved in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants are involved in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants are involved in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference cost, API cost, latency, or per-example cost is reported for either the Encoder or the Adjusted LLM components." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No computational budget is stated. The fine-tuning of ChatGLM-6B and the API usage of ChatGPT-4 are not quantified in terms of GPU hours, API spend, or training time." 290 } 291 } 292 }, 293 "claims": [ 294 { 295 "claim": "The Signed-Prompt Encoder achieves near-perfect correctness in replacing 'delete' semantics across languages and expression types, with 100% for direct, multilingual, and varied expressions, and 96.67% for implications.", 296 "evidence": "Table 1 shows correctness rates for four groups: Direct (100%), Multilingual (100%), Varied Expression (100%), Implication (96.67%). Sample size per group is not stated.", 297 "supported": "weak" 298 }, 299 { 300 "claim": "LLMs with Signed-Prompt defense achieve 0% attack success rate across all four attack groups for both prompt engineering and fine-tuning implementations.", 301 "evidence": "Table 3 shows attacker success rate of 0% for both LLM-PE and LLM-FT across all four groups (Direct, Multilingual, Varied Exp, Implication).", 302 "supported": "weak" 303 }, 304 { 305 "claim": "The prompt engineering implementation (LLM-PE) achieves 100% correctness for signed user commands across all four groups.", 306 "evidence": "Table 3 shows LLM-PE correctness rate of 100% for Direct, Multilingual, Varied Exp, and Implication groups.", 307 "supported": "weak" 308 }, 309 { 310 "claim": "Signed-Prompt provides 'exceptional effectiveness' as a 'robust defense strategy' against prompt injection attacks.", 311 "evidence": "Based on testing a single command type ('delete') with its variants. No comparison to existing defenses, no adversarial stress testing, no testing of signature extraction attacks.", 312 "supported": "unsupported" 313 } 314 ], 315 "red_flags": [ 316 { 317 "flag": "Extremely narrow evaluation scope", 318 "detail": "The entire experimental evaluation tests only one command type ('delete') and its semantic variants. No other sensitive operations (data exfiltration, role manipulation, instruction override) are tested. Claims of 'robust defense' against 'various types of prompt injection attacks' are not supported by single-command testing." 319 }, 320 { 321 "flag": "No baseline comparisons", 322 "detail": "The paper criticizes existing defenses (output filtering, input filtering, delimiters, Dual LLM) in the introduction but never experimentally compares Signed-Prompt against any of them. It is impossible to assess relative effectiveness." 323 }, 324 { 325 "flag": "Claims significantly outrun evidence", 326 "detail": "The abstract and conclusion claim 'substantial resistance to various types of prompt injection attacks' and 'robust defense strategy,' but evidence comes from one command type on a custom dataset of unstated size. The gap between claims and evidence is severe." 327 }, 328 { 329 "flag": "Security-by-obscurity concern not addressed", 330 "detail": "The method relies on keeping the signature token ('toeowx') secret. The paper does not discuss what happens if the signature is leaked, extracted via prompt injection itself, or brute-forced. The statement 'If the user's unique deletion instruction signature is not leaked' acknowledges but does not address this fundamental limitation." 331 }, 332 { 333 "flag": "Unstated sample sizes", 334 "detail": "The total number of test cases per group is never reported. Table 1 shows only 'first 3 entries each group.' The implication group's 96.67% correctness suggests ~30 entries, but this is never confirmed. Results cannot be properly interpreted without knowing N." 335 }, 336 { 337 "flag": "Potential train/test overlap for fine-tuned model", 338 "detail": "LLM-FT was fine-tuned on data 'similar to the previous test data in Table 1' and then tested on the same groups. No explicit separation between fine-tuning data and test data is documented." 339 } 340 ], 341 "cited_papers": [ 342 { 343 "title": "Evaluating the susceptibility of pre-trained language models via handcrafted adversarial examples", 344 "authors": ["H. J. Branch", "J. R. Cefalu", "J. McHugh", "L. Hujer", "A. Bahl", "D. D. C. Iglesias", "R. Darwishi"], 345 "year": 2022, 346 "arxiv_id": "2209.02128", 347 "relevance": "Early work on adversarial attacks against pre-trained language models, foundational to the prompt injection threat model." 348 }, 349 { 350 "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection", 351 "authors": ["S. Abdelnabi", "K. Greshake", "S. Mishra", "C. Endres", "T. Holz", "M. Fritz"], 352 "year": 2023, 353 "relevance": "Seminal paper on indirect prompt injection attacks against LLM-integrated applications, directly motivating the Signed-Prompt defense." 354 }, 355 { 356 "title": "Prompt Injection Attacks and Defenses in LLM-Integrated Applications", 357 "authors": ["Y. Liu", "Y. Jia", "R. Geng", "J. Jia", "N. Z. Gong"], 358 "year": 2023, 359 "arxiv_id": "2310.12815", 360 "relevance": "Systematic study of prompt injection attacks and defense mechanisms for LLM-integrated applications." 361 }, 362 { 363 "title": "Assessing Prompt Injection Risks in 200+ Custom GPTs", 364 "authors": ["J. Yu", "Y. Wu", "D. Shu", "M. Jin", "X. Xing"], 365 "year": 2023, 366 "arxiv_id": "2311.11538", 367 "relevance": "Large-scale assessment of prompt injection vulnerabilities in deployed LLM applications (Custom GPTs)." 368 }, 369 { 370 "title": "Tensor Trust: Interpretable Prompt Injection Attacks from an Online Game", 371 "authors": ["S. Toyer", "O. Watkins", "E. A. Mendes", "J. Svegliato", "L. Bailey", "T. Wang", "S. Russell"], 372 "year": 2023, 373 "arxiv_id": "2311.01011", 374 "relevance": "Crowdsourced collection and analysis of prompt injection attack strategies through gamification." 375 }, 376 { 377 "title": "Prompt Injection attack against LLM-integrated Applications", 378 "authors": ["Y. Liu", "G. Deng", "Y. Li", "K. Wang", "T. Zhang", "Y. Liu"], 379 "year": 2023, 380 "arxiv_id": "2306.05499", 381 "relevance": "Analysis of prompt injection attack patterns against LLM-integrated applications." 382 } 383 ] 384 }