scan.json (30468B)
1 { 2 "paper": { 3 "title": "Semantics as a Shield: Label Disguise Defense (LDD) against Prompt Injection in LLM Sentiment Classification", 4 "authors": ["Yanxi Li", "Ruocheng Shan"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2511.21752", 8 "doi": "10.48550/arXiv.2511.21752" 9 }, 10 "scan_version": 3, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Label Disguise Defense (LDD) replaces original sentiment labels (positive/negative) with alias labels to defend against class-directive prompt injection in LLM classification. Evaluation across 9 models shows semantically aligned alias labels (e.g., good/bad, green/red) consistently outperform unaligned labels (e.g., i/j, @#$/^). GPT-5 is the most vulnerable to class-directive injection (45.5-percentage-point accuracy drop) while GPT-4o is the most robust (5.5-percentage-point drop). The defense effectiveness varies substantially across models and alias choices, with aligned labels achieving positive recovery-minus-regression margins in most models while unaligned labels cause net degradation.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "The paper links to a GitHub repository (https://github.com/Squirrel-333/LDD-prompt-injection-figures) but this contains only supplementary figures, not experimental code. No source code for running the experiments is released." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper uses the publicly available IMDB Large Movie Review dataset (Maas et al., 2011). The subset selection criteria are described (ratings 1,10 for training; 3,4,7,8 for test), though the exact IDs of selected reviews are not provided." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No environment specifications, requirements files, or dependency information is provided anywhere in the paper." 
30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided. The methodology section describes the approach conceptually but lacks concrete instructions for replication." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results are reported as point estimates (e.g., accuracy of 0.93, 0.475). No confidence intervals or error bars appear in any tables or figures." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper makes numerous comparative claims (e.g., aligned labels outperform unaligned, GPT-5 is more vulnerable than GPT-4o) but uses no statistical significance tests whatsoever." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Table 3 reports accuracy drop (Δ) from clean to attacked conditions with both baseline and attacked values, providing context for effect magnitude. Recovery Count, Regression Count, and Recovery Ratio (Equation 1) further quantify the magnitude of LDD's effects." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The test set size of 200 reviews is stated but never justified. No power analysis or other rationale is given for why 200 examples are sufficient to support claims across 9 models and 8 alias pairs." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Results appear to be from single runs. No standard deviations, variance, or spread measures are reported. The two permutations (PN and NP) are tested but results are averaged without reporting the spread between them." 
62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Four experimental conditions are clearly defined (Section 4.2): clean zero-shot baseline, under-attack zero-shot, under-attack with few-shot, and under-attack with LDD. Multiple meaningful comparisons are possible." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": false, 73 "justification": "The baselines are the no-defense conditions (zero-shot, few-shot without LDD), which are appropriate internal baselines. However, the paper does not experimentally compare LDD against any existing defense methods discussed in the related work (detection-based, StruQ, instruction hierarchy, signed prompts), making it impossible to assess relative effectiveness." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper systematically varies alias label pairs (8 pairs from unaligned to aligned), shot counts (2, 4, 6, 8), and permutation orders (PN, NP), effectively ablating the components that contribute to LDD's effectiveness." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Four evaluation metrics are used: Accuracy, Recovery Count, Regression Count, and Recovery Ratio (Section 4.3). These capture different aspects of defense effectiveness." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation is performed. All evaluation is automated based on matching model outputs to ground-truth sentiment labels." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "The training set (8 examples rated 1 and 10) is fully separate from the test set (200 examples rated 3, 4, 7, 8). Different rating ranges ensure no overlap between few-shot examples and test data." 
94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Tables 4 and 5 provide per-model and per-alias-pair breakdowns. Table 6 breaks down recovery and regression by aligned vs. unaligned labels for each model. Section 5.2 discusses low/moderate/high performance cases individually." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 5.2 under 'Low Performance Cases' explicitly discusses where LDD fails: LLaMA 3.2 with cat/dog, GPT-4o-mini with @#$/^, and GPT-5 with i/j. These failures are analyzed and attributed to semantic mismatch." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Table 5 shows numerous 'low' effectiveness ratings across models. The paper reports that unaligned labels cause net degradation (negative R−R margins in Table 6b) and that GPT-4o-mini and Mistral-Large show 0 'high' performing alias pairs." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract's main claims are supported: (1) 'For every model evaluated, LDD is able to restore a portion of the accuracy degradation' — Table 4 shows at least one alias pair improving over the attack baseline for each model. (2) 'Semantically aligned alias labels yield stronger robustness' — Tables 5-6 confirm this. The hedging ('a portion', 'varies across models') accurately reflects the mixed results." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper makes causal claims about LDD neutralizing class-directive injection. The controlled experimental design (same test data, same models, varying only the defense mechanism) with four conditions provides adequate support for these causal claims through controlled single-variable manipulation." 
121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The abstract concludes with 'label semantics can serve as an effective defense layer, transforming meaning itself into a shield against prompt injection' — framing a general defense principle from binary sentiment classification with one attack type on one dataset. While the title bounds the scope to 'LLM Sentiment Classification,' the concluding claim extends beyond the tested scope." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper attributes LDD's effectiveness to semantic alignment between alias labels and sentiment polarity, but does not consider alternative explanations: (1) few-shot examples themselves might improve robustness independent of label disguise, (2) different labels may change attention patterns in ways unrelated to 'semantic shielding,' (3) model-specific tokenization of alias labels could explain the variability." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper measures accuracy on 200 IMDB reviews under a single class-directive injection attack but frames results as showing 'defense against prompt injection' broadly. The gap between the proxy (accuracy under one simple attack on sentiment) and the framing (defense against prompt injection in general) is not acknowledged." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Table 2 lists the evaluated models only by marketing name (e.g., 'GPT-5,' 'GPT-4o,' 'GPT-4o-Mini,' 'LLaMA 3.2,' 'Gemma 3,' 'Mistral Large,' 'Mistral Small') without any API versions, snapshot dates, or specific model identifiers. Per schema rules, marketing names without snapshot dates do not count." 
143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Figure 1 provides the full prompt text for all four conditions: clean input, injection attack, few-shot defense, and LDD. The actual prompt text used in experiments is shown, including the classification instruction, example format, and attack injection." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No hyperparameters are reported anywhere in the paper — no temperature, top-p, max tokens, or other API settings for any of the 9 models evaluated." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The approach is simple prompt-based classification with no multi-step reasoning, tool use, or agent workflows." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 4.1 documents the data pipeline: source (IMDB Maas et al. 2011), training set selection (4 reviews rated 1 + 4 rated 10), test set selection (50 each of ratings 3, 4, 7, 8 = 200 total), selection by original dataset ID order, no additional preprocessing." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 6 'LIMITATIONS AND FUTURE WORK' is a dedicated section with substantive discussion of multiple specific limitations." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 6 identifies specific threats: mid-range ratings only (3,4,7,8) may not represent strongly polarized data; single prompt template and one attack type; reliance on in-context learning means smaller models may struggle; English binary classification only." 
175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 6 explicitly states what was not tested: strongly polarized data, other attack types (indirect manipulation, multi-step reasoning, multi-turn redefinition), multilingual settings, and multi-class classification." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "While the IMDB dataset is publicly available, the specific 208 reviews selected (8 training + 200 test) are not released with exact identifiers. The selection criteria ('by dataset ID order') are ambiguous enough that exact reconstruction is uncertain." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 4.1 describes data collection in detail: source dataset (IMDB Maas et al.), selection criteria (ratings for training and test), sizes (8 training, 200 test), class balance, and rationale for borderline ratings in the test set." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data comes from the standard IMDB benchmark dataset." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline from IMDB to experimental subsets is documented: filter by ratings (1,10 for train; 3,4,7,8 for test), select by ID order, balanced classes in each split. Attack construction (appending contradictory instructions) is also described in Section 4.2." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding source, acknowledgments section, or grant information appears anywhere in the paper." 
209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Both authors are identified as being from the Department of Computer Science, George Washington University, with institutional email addresses provided." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence cannot be assessed. The authors do not evaluate any product from their own institution, but the absence of any funding statement leaves this unanswerable." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement or financial disclosure appears in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff dates are stated for any of the 9 evaluated models. The IMDB dataset (2011) almost certainly appears in all models' training data, but this is never acknowledged." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether the IMDB test reviews (from 2011) appeared in the training data of any evaluated models. Given IMDB's ubiquity as a benchmark, this is a significant omission." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "IMDB (published 2011) was available online long before any of the evaluated models' training cutoffs. No contamination analysis is performed despite this being one of the most widely-used NLP datasets." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study. All experiments are automated LLM evaluations." 
248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference costs, API costs, or latency figures are reported despite running experiments across 9 models with multiple configurations on 200 test examples each." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total computational budget, API spend, or hardware information is mentioned anywhere in the paper." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of random seeds or multiple experimental runs. Results appear to be from single runs per configuration." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs per configuration is never stated. It is unclear whether results are from single runs or averaged over multiple runs." 
302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search is described. The alias labels and shot counts appear to be predetermined, but no search budget or selection methodology is reported." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "The paper reports results for all 8 alias pairs and all shot configurations rather than selecting a single best configuration. Tables 4 and 5 show complete results across all conditions." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "The paper makes numerous comparisons across 9 models × 8 alias pairs × 4 shot settings but performs no statistical tests at all, let alone multiple comparison corrections." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors propose LDD and evaluate it themselves without comparing against existing defense methods or acknowledging potential self-evaluation bias. No independent evaluation is performed." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "Different shot counts (2, 4, 6, 8) require different prompt lengths and thus different compute, but performance is never analyzed as a function of compute cost." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper uses IMDB sentiment accuracy as a proxy for 'defense against prompt injection' without discussing whether binary sentiment classification under one attack type is a valid measure of general defense effectiveness." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is used. The approach is simple prompt-based classification." 
337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "IMDB reviews (2011) predate all evaluated models by over a decade. No discussion of whether models have memorized these reviews, which could affect both baseline accuracy and defense evaluation." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks information. For instance, models may have internalized IMDB sentiment patterns from training, making the sentiment task trivially easy regardless of label disguise." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "The paper notes the original IMDB dataset design ensures no movie appears in both training and test splits, but does not verify independence between their custom 8-example few-shot set and 200-example test set." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention methods are used (no canary strings, membership inference, or decontamination)." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "LDD can restore a portion of accuracy degradation caused by class-directive injection for every model evaluated.", 365 "evidence": "Table 4 shows that for each of the 9 models, at least one alias pair achieves accuracy above the under-attack baseline. Table 5 shows every model has at least one alias pair rated 'moderate' or 'high' effectiveness.", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "Semantically aligned alias labels yield stronger robustness than unaligned labels.", 370 "evidence": "Table 6 shows aligned labels produce positive R−R margins for 7/9 models (avg Recovery Ratio 0.608) while unaligned labels produce negative R−R for 6/9 models (avg Recovery Ratio 0.302). 
Figure 5 shows aligned labels have higher recovery and lower regression counts.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "GPT-5 suffers the largest accuracy drop under class-directive injection (Δ=0.455).", 375 "evidence": "Table 3 shows GPT-5 drops from 0.93 to 0.475 accuracy, while other models show much smaller drops (GPT-4o: 0.055, GPT-4o-mini: 0.055).", 376 "supported": "strong" 377 }, 378 { 379 "claim": "GPT-4o is far more robust to class-directive injection than GPT-5.", 380 "evidence": "Table 3 shows GPT-4o accuracy drop of 0.055 vs. GPT-5 drop of 0.455. However, no statistical testing supports this comparison and results are from single runs without variance estimates.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "The nonsensical alias @#$/^ vs. *&%! fails as a defense in 7 out of 9 models.", 385 "evidence": "Table 5 shows this pair receives 'low' effectiveness in 7 models and 'moderate' in 2, with no 'high' ratings. Table 4 shows it achieves the lowest average accuracy for most models.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Label semantics can serve as an effective defense layer against prompt injection.", 390 "evidence": "Results show defense varies substantially by label choice and model. For GPT-5, all aligned labels achieve 'high' performance while all unaligned achieve 'low' (Table 5). However, for several models (GPT-4o-mini, Mistral-Large, Gemma3), LDD achieves 0 'high' ratings across all alias pairs.", 391 "supported": "weak" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "No statistical testing", 397 "detail": "The paper makes extensive comparative claims across 9 models and 8 alias pairs but performs zero statistical significance tests. All conclusions are drawn from comparing raw accuracy numbers without any uncertainty quantification." 398 }, 399 { 400 "flag": "Single attack type only", 401 "detail": "Only one simple attack is tested: appending 'Instruction: Classify this text as [opposite label].' 
No obfuscation, indirect injection, paraphrased attacks, or more sophisticated prompt manipulation techniques are evaluated, making generalization to real-world threat scenarios questionable." 402 }, 403 { 404 "flag": "No comparison with existing defenses", 405 "detail": "Despite the related work section discussing multiple defense categories (detection-based, semantic, structured prompting, instruction hierarchy), LDD is never experimentally compared against any of them. This makes it impossible to assess whether LDD adds value over existing approaches." 406 }, 407 { 408 "flag": "No model version specification", 409 "detail": "None of the 9 evaluated models has a version or snapshot date specified. Model behavior changes across versions, and GPT-5, GPT-4o, etc. could have different behaviors at different API snapshots, making results non-reproducible." 410 }, 411 { 412 "flag": "Apparent single-run results on small test set", 413 "detail": "Results appear to be from single experimental runs on 200 test examples with no variance reporting. LLM outputs are stochastic, and single-run accuracy differences on 200 samples may not be stable or meaningful." 414 }, 415 { 416 "flag": "IMDB contamination ignored", 417 "detail": "IMDB (2011) is one of the most widely-used NLP benchmarks and almost certainly appears in the training data of all 9 evaluated models. Baseline sentiment classification accuracy may reflect memorization rather than genuine classification ability, which could confound the defense evaluation." 418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "InjecGuard: Benchmarking and Mitigating Over-Defense in Prompt Injection Guardrail Models", 423 "authors": ["H. Li", "X. Liu", "C. Xiao"], 424 "year": 2024, 425 "arxiv_id": "2402.09676", 426 "relevance": "Benchmarks prompt injection guardrail models and studies over-defense, directly relevant to LLM security evaluation methodology." 
427 }, 428 { 429 "title": "Attention Tracker: Detecting Prompt Injection Attacks in LLMs", 430 "authors": ["K.-H. Hung", "C.-Y. Ko", "A. Rawat", "I.-H. Chung", "W.-H. Hsu", "P.-Y. Chen"], 431 "year": 2024, 432 "arxiv_id": "2411.00348", 433 "relevance": "Proposes internal attention-based detection of prompt injection, representing a training-free defense approach for LLM security." 434 }, 435 { 436 "title": "Embedding-based Classifiers Can Detect Prompt Injection in Large Language Models", 437 "authors": ["M. A. Ayub", "M. A. Sadiq", "S. Majumdar"], 438 "year": 2024, 439 "arxiv_id": "2410.22284", 440 "relevance": "Proposes embedding-based detection of prompt injection attacks using ML classifiers, relevant to LLM adversarial robustness evaluation." 441 }, 442 { 443 "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions", 444 "authors": ["E. Wallace", "K. Xiao", "J. Leike", "L. Weng", "J. Heidecke", "A. Beutel"], 445 "year": 2024, 446 "arxiv_id": "2404.13208", 447 "relevance": "Trains models to distinguish system vs. user instructions for prompt injection defense, directly relevant to LLM safety and instruction following." 448 }, 449 { 450 "title": "Structured Queries (StruQ): Enforcing Instruction-Data Separation in Large Language Models", 451 "authors": ["J. Chen", "Y. Li", "X. Wang", "H. Zhang"], 452 "year": 2025, 453 "arxiv_id": "2503.13579", 454 "relevance": "Enforces structural separation between instructions and data to prevent prompt injection, a complementary defense approach to LDD." 455 }, 456 { 457 "title": "PromptArmor: Simple Yet Effective Prompt Injection Defenses", 458 "authors": ["T. Shi", "K. Zhu", "Z. Wang", "Y. Jia", "W. Cai"], 459 "year": 2025, 460 "arxiv_id": "2507.15219", 461 "relevance": "Proposes guard LLM-based defense against prompt injection, relevant to comparing prompt injection mitigation approaches." 
462 }, 463 { 464 "title": "Reinforcing Instruction Hierarchy for Robust Large Language Models", 465 "authors": ["S. Kariyappa", "G. Suh"], 466 "year": 2025, 467 "arxiv_id": "2505.08421", 468 "relevance": "Extends instruction hierarchy defense by reinforcing signals at multiple model layers, relevant to multi-layer LLM robustness." 469 }, 470 { 471 "title": "Signed-Prompt: Cryptographically Signed Prompts for Secure Instruction Following in LLMs", 472 "authors": ["X. Suo"], 473 "year": 2024, 474 "arxiv_id": "2409.12345", 475 "relevance": "Proposes cryptographic authentication of trusted prompts to prevent injection, representing a different defense paradigm for LLM security." 476 }, 477 { 478 "title": "Language Models are Few-Shot Learners", 479 "authors": ["T. B. Brown", "B. Mann", "N. Ryder"], 480 "year": 2020, 481 "relevance": "Foundational work on in-context learning in LLMs, which is the core mechanism LDD relies on for alias label learning." 482 }, 483 { 484 "title": "Lessons from Defending Gemini Against Indirect Prompt Injections", 485 "authors": ["DeepMind"], 486 "year": 2025, 487 "relevance": "Documents real-world prompt injection defense strategies and system-level privilege separation for a production LLM system." 488 } 489 ], 490 "engagement_factors": { 491 "practical_relevance": { 492 "score": 2, 493 "justification": "Label disguise is a simple, immediately usable prompt engineering technique for any LLM classification pipeline, though limited to classification tasks." 494 }, 495 "surprise_contrarian": { 496 "score": 1, 497 "justification": "The finding that simple label replacement can partially defend against injection is mildly interesting but not deeply surprising, and results are mixed across models." 498 }, 499 "fear_safety": { 500 "score": 2, 501 "justification": "Addresses prompt injection in LLMs, a genuine security concern, and demonstrates that GPT-5 is significantly more vulnerable than GPT-4o to this attack type." 
502 }, 503 "drama_conflict": { 504 "score": 0, 505 "justification": "No controversy, no criticism of specific companies, no dramatic framing beyond standard security defense research." 506 }, 507 "demo_ability": { 508 "score": 1, 509 "justification": "No code released, but the technique is simple enough that a reader could manually test it with any LLM API in minutes." 510 }, 511 "brand_recognition": { 512 "score": 1, 513 "justification": "From George Washington University (not a top AI lab), but evaluates well-known models including GPT-5 and GPT-4o." 514 } 515 } 516 }