ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (32695B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Defending Against Prompt Injection With a Few DefensiveTokens",
      6     "authors": [
      7       "Sizhe Chen",
      8       "Yizhu Wang",
      9       "Nicholas Carlini",
     10       "Chawin Sitawarin",
     11       "David Wagner"
     12     ],
     13     "year": 2025,
     14     "venue": "AISec@CCS",
     15     "arxiv_id": "2507.07974",
     16     "doi": "10.1145/3733799.3762982"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract's claim that DefensiveToken achieves security 'comparable to training-time alternatives' is supported by Table 3 results (0.24% avg ASR on TaskTracker vs. 0.20–0.51% for training-time defenses). The flexibility and minimal-utility-drop claims are supported by the Table 3 WinRate columns and Figure 3.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims (DefensiveToken reduces ASR, improves security) are supported by controlled ablation studies in Section 4.5 varying number of tokens, initialization, loss function, position, and learning rate. Comparisons with and without DefensiveTokens isolate the causal contribution.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "All experiments use 4 open-weight 7–8B models only. The paper does not bound generalization claims to this model-size range, and conclusions like 'comparable to training-time defenses' implicitly extend beyond the tested scope without acknowledging inapplicability to larger or closed models.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper presents a single mechanistic hypothesis for why DefensiveToken works (large embedding magnitude enabling optimization, Table 2) without considering alternatives such as attention-head anchoring, semantic priming, or training-data distribution shift.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Metrics directly correspond to claims: ASR measures security against prompt injection, WinRate measures utility. No proxy conflation—the paper does not call ASR a stand-in for a broader safety property it doesn't measure.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations section. Limitations appear only in the conclusion paragraph: 'DefensiveToken only defends against prompt injections...does not apply to other safety settings...we do not know the utility on more labeled datasets.' A concluding paragraph does not qualify.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats-to-validity section exists. The conclusion briefly mentions uncertainty about utility on more labeled datasets, but does not discuss specific threats such as LLM judge reliability, benchmark saturation, or architecture dependency.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The conclusion explicitly states: 'DefensiveToken only defends against prompt injections, where the user is benign and the environment is malicious. DefensiveToken does not apply to other safety settings, e.g., preventing jailbreaks, system following attacks, and data extraction attacks.'",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Funding is explicitly listed in acknowledgments: Google-BAIR Commons, NSF (grant 2229876), OpenAI, Open Philanthropy, Google, the Department of Homeland Security, and IBM.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are listed in the paper header: UC Berkeley (Chen, Wang, Wagner), Google DeepMind and Anthropic (Carlini), Google DeepMind (Sitawarin).",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Google and OpenAI are funders; two co-authors (Carlini, Sitawarin) are affiliated with Google DeepMind; and gpt-4o (OpenAI's model) is the primary LLM judge throughout evaluation. This creates alignment between funder interests and both the research direction and evaluation tooling.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "Funding sources are acknowledged but there is no explicit 'competing interests' or 'financial interests' statement. No declaration about patents, equity, or consulting relationships is included.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are precisely defined: prompt injection is defined as inserting malicious instructions into data consumed by an LLM; the threat model is formally specified in Section 3.1; ASR is defined; the [INST]/[DATA]/[RESP] prompt format is shown with concrete examples.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The contribution is clearly stated: 'We introduce DefensiveToken, the first test-time prompt injection defense that is as effective as training-time ones in most cases.' Table 1 situates it against all existing defense categories across three axes.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 explicitly situates DefensiveToken against attack types (optimization-free vs. optimization-based), defense categories (detection vs. prevention, test-time vs. training-time), and prior methods (StruQ, SecAlign, ISE, Jatmo, instruction hierarchy, prompt tuning), explaining how DefensiveToken differs from each.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The abstract states 'The code is available here' with a hyperlink. The paper references specific implementation components (peft library, PyTorch FSDP) consistent with a released codebase.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All evaluation datasets are publicly available standard benchmarks: AlpacaFarm, SEP, TaskTracker (31K samples), CyberSecEval2, InjecAgent, and Cleaned Alpaca (training data). No proprietary data was created.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper mentions 'four NVIDIA Tesla A100s (80GB) with PyTorch FSDP' and the peft library, but provides no requirements.txt, Dockerfile, or version-pinned dependency list.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Algorithm 1 provides the optimization procedure at a high level and key hyperparameters are reported, but no step-by-step execution instructions are provided. Users would need to substantially infer how to run the experiments.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Main results in Table 3 (all 24 model-benchmark combinations) are single-run point estimates without error bars. Variance is reported for only one configuration: SEP/Llama3.1-8B over 5 runs (WinRate 53.84±0.56, ASR 2.81±1.09).",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are reported for any comparative claim. All differences between methods are presented as raw percentage differences without p-values or confidence intervals.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Effect sizes are clearly quantified: DefensiveToken reduces GCG ASR from 95.2% to 48.8%; reduces optimization-free ASR by an order of magnitude versus no defense on major benchmarks; WinRate drops of less than 2pp for utility. Baselines provide meaningful reference points throughout.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "No justification is given for the choice of 4 models or these specific benchmarks. No power analysis is provided. The selection of 5 defensive tokens is empirically justified (Table 4) but not via pre-study power calculation.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Variance is only reported for one model-benchmark combination (Llama3.1-8B on SEP, 5 runs). All other results in Table 3 are single-run point estimates.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Multiple baselines are included: test-time (Reminder, Sandwich, TextGrad) and training-time (StruQ-LoRA, StruQ-Full, SecAlign-LoRA), plus 'no defense' baseline, all evaluated across the same 5 benchmarks and 4 models.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines are current: StruQ (2025), SecAlign (2025), TextGrad (2025), Instruction Hierarchy (2024, gpt-4o). The paper also explicitly justifies excluding detection-based and system-level defenses as out-of-scope for the threat model.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Section 4.5 provides comprehensive ablation studies: number of tokens (Table 4), initialization strategy (Table 5), loss function (Table 6), insertion position (Table 7), and learning rate (Table 8). Multiple design choices are varied independently.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Both security (ASR for optimization-free attacks and GCG-ASR for optimization-based) and utility (WinRate on AlpacaEval2) metrics are reported. Figure 3 visualizes the Pareto utility-security trade-off explicitly.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "Human evaluation is not applicable here. Utility is measured via LLM judge (gpt-4o through AlpacaEval2) and security via automated string matching or LLM judge. This is an automated security defense evaluation without system outputs requiring human judgment.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "DefensiveToken optimization uses Cleaned Alpaca (51K training samples), explicitly stated to be 'different and in another domain from' the AlpacaFarm evaluation set. 'The user instructions and injections in evaluation have no overlap with those used in model training.'",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Table 3 provides full per-model (4 models) and per-benchmark (5 benchmarks) breakdowns for all methods, enabling fine-grained analysis of where DefensiveToken succeeds or underperforms.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Failure cases are explicitly discussed: GCG attacks still achieve 48.8% avg ASR (vs. near-0% for StruQ-Full); Falcon3-7B requires 20 tokens for near-zero ASR vs. 5 for other models; InjecAgent agentic setting shows DefensiveToken is 'slightly weaker than training-time alternatives.'",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Negative results are honestly reported: SecAlign loss applied to token embeddings catastrophically hurts utility (Table 6, ~10pp WinRate drop); insertion at end of input significantly degrades both utility and security (Table 7); lr=0.01 fails to achieve security (Table 8).",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Specific model identifiers are provided: Llama3-8B-Instruct, Llama3.1-8B-Instruct, Falcon3-7B-Instruct, Qwen2.5-7B-Instruct. The LLM judge is identified as gpt-4o with AlpacaEval2 reference model version turbo-2024-04-09.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Full prompts and injection examples are reproduced for all 4 attack variants (Ignore, Completion, Ignore-Completion, GCG) across all 5 benchmarks, including the exact [INST]/[DATA]/[RESP] format and complete injection text.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Full hyperparameters reported: learning rate 0.1, 1 epoch, 5 tokens, LoRA parameters (r=64, lora_alpha=8, lora_dropout=0.1, target_modules=['q_proj','v_proj']), Cleaned Alpaca 51K training samples, and batch training via PyTorch FSDP.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "There is no agentic scaffolding in the core DefensiveToken evaluation. InjecAgent uses ReAct prompts from an existing benchmark, not custom scaffolding designed by the authors.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "The defensive training dataset construction is described in Algorithm 1 and Section 3.3: self-labeled responses from undefended LLM, 50% samples unchanged, 25% each with two injection variants. Injection placement rules per benchmark are also specified.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Security evaluation relies on gpt-4o judge decisions which are not publicly released, making independent verification of ASR numbers impossible. Only aggregate statistics are presented; individual judge responses are unavailable.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Data collection is described: standard benchmarks used with injections appended per benchmark-specific rules (end-of-data for SEP, benchmark-specified placement for TaskTracker). Training data generation from Cleaned Alpaca is described in Algorithm 1.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants were recruited. All evaluation uses standard benchmark datasets and LLM judges.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The pipeline from training data construction (Algorithm 1) through DefensiveToken optimization to evaluation (LLM judge via AlpacaEval2 for utility, gpt-4o for security, string matching for AlpacaFarm) is documented at a level sufficient to understand the full flow.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "Training data cutoffs for Llama3, Llama3.1, Falcon3, and Qwen2.5 base models are not stated. The paper only verifies that the defensive token optimization data doesn't overlap with evaluation benchmarks, not whether base model pre-training included evaluation examples.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "The paper states 'The user instructions and injections in evaluation have no overlap with those used in model training' (for defensive token optimization), but does not discuss potential overlap between the base LLMs' pre-training data and evaluation benchmarks like AlpacaFarm or SEP.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "Contamination of the base LLMs' pre-training data with evaluation benchmarks is not addressed. This matters for utility measurement—if Llama3 or Qwen2.5 pre-training included AlpacaFarm examples, the WinRate baseline would be inflated.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "The paper notes DefensiveTokens add 'only 5 more tokens' to input but provides no quantitative inference latency or throughput measurements. The overhead is described qualitatively as 'minimal changes to the LLM system' without measured numbers.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Training compute is reported: 'four NVIDIA Tesla A100s (80GB) with PyTorch FSDP and takes one hour to complete.' This provides actionable compute budget information.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "DefensiveToken achieves security comparable to training-time defenses on optimization-free prompt injection attacks, with 0.24% average ASR on TaskTracker vs. 0.20–0.51% for training-time alternatives.",
    375       "evidence": "Table 3 and Figure 2: DefensiveToken achieves 0.24% avg ASR on TaskTracker across 4 models; StruQ-LoRA 0.24%, StruQ-Full 0.20%, SecAlign-LoRA 0.51%. Similar results on AlpacaFarm and SEP benchmarks.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "DefensiveToken reduces optimization-free ASR by an order of magnitude compared to prompting-based test-time defenses (Reminder, Sandwich), which 'never reduce ASRs by over two times' across all benchmarks.",
    380       "evidence": "Figure 2 and Table 3: Reminder achieves 19.8–35.3% ASR on TaskTracker while DefensiveToken achieves 0.19–0.27%; on AlpacaFarm, Sandwich reduces ASR only to 56.7–85.6% while DefensiveToken achieves 0.48–4.81%.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "DefensiveToken incurs smaller utility loss than all tested baselines (test-time and training-time) because only 5 additional tokens are added to the input.",
    385       "evidence": "Figure 3 shows DefensiveToken is closest to the ideal defense on both AlpacaFarm and SEP. Table 3 WinRate column shows <2pp utility drop versus no-defense baseline in most cases, outperforming TextGrad (−6 to −19pp), StruQ-Full (Falcon3: −5pp), and SecAlign (−2 to −9pp).",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Against adaptive GCG optimization-based attacks (attacker knows DefensiveToken embeddings), DefensiveToken reduces average ASR from 95.2% to 48.8% while prompting-based baselines achieve near-100% ASR.",
    390       "evidence": "Figure 2 (GCG sub-figure): Reminder/Sandwich achieve 96.6–100% GCG ASR; DefensiveToken's GCG ASR ranges from 37.5% (Llama3.1-8B) to 73.6% (Qwen2.5-7B), averaging 48.8% across the 4 models.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Optimized DefensiveToken embeddings have 100x larger L1-norm magnitude than vocabulary token embeddings, making prompting-based approaches unable to replicate their security effectiveness.",
    395       "evidence": "Table 2: vocabulary tokens average L1-norm 34 (max 47) vs. DefensiveTokens average 4332 (max 4594) for Llama-3.1-8B-Instruct. Correlation with security is argued but not causally established.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "5 defensive tokens are sufficient for all 4 tested models to achieve near-optimal security; random initialization outperforms text-embedding initialization due to facilitating larger optimization magnitude.",
    400       "evidence": "Table 4: 5 tokens achieve 0.48% ASR on Llama3/3.1-8B (same as 20 tokens). Table 5: random initialization achieves 0.48% ASR vs. space initialization's 2.40% with 5 tokens on Llama3.1-8B. Falcon3-7B exception noted.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval"
    406   ],
    407   "key_findings": "DefensiveToken proposes optimizing special token embeddings (not model weights) that can be optionally prepended to LLM inputs at test time to achieve prompt injection robustness comparable to training-time fine-tuning while preserving developer flexibility to skip the defense when utility is prioritized. On 5 benchmarks with 4 open-weight 7–8B LLMs, DefensiveToken reduces optimization-free attack success rates by orders of magnitude over prompting-based baselines (0.24% avg ASR on TaskTracker vs. 11–30% for existing test-time defenses) while incurring less utility loss than all competing methods. The defense achieves a superior utility-security Pareto frontier, sitting closest to the ideal defense in Figure 3. The main remaining gap is against adaptive optimization-based attacks (GCG), where DefensiveToken reduces ASR from 95.2% to 48.8% while training-time defenses achieve near-zero—a limitation honestly reported.",
    408   "red_flags": [
    409     {
    410       "flag": "LLM judge reliance without reliability check",
    411       "detail": "Security evaluation throughout relies on gpt-4o to determine attack success. No inter-rater reliability between the LLM judge and human annotators is reported. OpenAI is also a named funder, creating alignment between funder interests and evaluation tooling."
    412     },
    413     {
    414       "flag": "Variance reported for only one configuration",
    415       "detail": "Five-run variance is reported only for SEP/Llama3.1-8B-Instruct. All other results in Table 3 (24 model-benchmark combinations) are single-run point estimates without error bars, making claimed comparisons statistically uninformative."
    416     },
    417     {
    418       "flag": "Results restricted to 7–8B open-weight models",
    419       "detail": "All four tested models are 7B or 8B parameters. Generalizability to larger models (70B+) or closed frontier models (GPT-4, Claude) is untested, and deployment requires providers to optimize and release DefensiveTokens alongside their models—a step that has not occurred in practice."
    420     },
    421     {
    422       "flag": "Funder conflict with judge tool",
    423       "detail": "OpenAI is a named funder and gpt-4o (OpenAI's model) is the primary LLM judge throughout evaluation. Google is a named funder and two co-authors are affiliated with Google DeepMind. No competing interests statement is present."
    424     },
    425     {
    426       "flag": "Code URL not visible in paper text",
    427       "detail": "The abstract states 'The code is available here' with a hyperlink, but the URL is not printed in the paper text, making code availability unverifiable from the paper alone."
    428     }
    429   ],
    430   "cited_papers": [
    431     {
    432       "title": "StruQ: Defending against prompt injection with structured queries",
    433       "relevance": "Direct predecessor and primary baseline; DefensiveToken uses StruQ's defensive loss function and training dataset construction methodology"
    434     },
    435     {
    436       "title": "SecAlign: Defending Against Prompt Injection with Preference Optimization",
    437       "relevance": "Key training-time baseline using preference optimization; SecAlign loss is ablated against StruQ loss in DefensiveToken context"
    438     },
    439     {
    440       "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents",
    441       "relevance": "Agentic tool-calling benchmark used to test generalization of DefensiveToken to API-calling/ReAct settings beyond instruction-following"
    442     },
    443     {
    444       "title": "Get my drift? Catching LLM Task Drift with Activation Deltas",
    445       "relevance": "TaskTracker benchmark (31K samples) used as the largest primary security evaluation dataset"
    446     },
    447     {
    448       "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    449       "relevance": "GCG attack method used to evaluate DefensiveToken robustness under strong adaptive optimization-based attacks"
    450     },
    451     {
    452       "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions",
    453       "relevance": "Training-time defense implemented in frontier models (gpt-4o, gemini-2.5-flash); contextualizes where DefensiveToken sits relative to provider-level solutions"
    454     },
    455     {
    456       "title": "Defeating prompt injections by design",
    457       "relevance": "System-level defense baseline; excluded from main comparison because it applies only to agentic control-flow cases"
    458     },
    459     {
    460       "title": "Can LLMs Separate Instructions From Data? And What Do We Even Mean By That?",
    461       "relevance": "SEP benchmark (9.1K samples) used in both security and utility-security trade-off evaluation"
    462     }
    463   ],
    464   "engagement_factors": {
    465     "practical_relevance": {
    466       "score": 3,
    467       "justification": "Directly addresses the OWASP #1 LLM threat with a deployable defense requiring 1 hour of training and 5 extra tokens at inference time with no infrastructure changes."
    468     },
    469     "surprise_contrarian": {
    470       "score": 2,
    471       "justification": "The finding that test-time token embedding optimization can match training-time fine-tuning security challenges the prevailing assumption that parameter updates are necessary for robust prompt injection defense."
    472     },
    473     "fear_safety": {
    474       "score": 2,
    475       "justification": "Addresses prompt injection attacks that have compromised real-world LLM products (Google Bard, Slack AI, Anthropic/OpenAI web agents), ranked #1 OWASP LLM threat, with growing agentic deployment surface."
    476     },
    477     "drama_conflict": {
    478       "score": 1,
    479       "justification": "Understated positioning of Berkeley academic work against industry-funded (Google/OpenAI) training-time solutions, with co-authors spanning both sides."
    480     },
    481     "demo_ability": {
    482       "score": 2,
    483       "justification": "Code is reportedly released, and training takes 1 hour on 4 A100s with standard open-weight models, making the defense feasibly reproducible by well-resourced practitioners."
    484     },
    485     "brand_recognition": {
    486       "score": 2,
    487       "justification": "Nicholas Carlini (Google DeepMind/Anthropic) is a prominent ML security researcher; David Wagner (UC Berkeley) is a well-known security faculty member; published at AISec@CCS, the top AI security workshop."
    488     }
    489   },
    490   "hn_data": {
    491     "threads": [
    492       {
    493         "hn_id": "40938701",
    494         "title": "Training a time series model using transformers at Datadog",
    495         "points": 27,
    496         "comments": 0,
    497         "url": "https://news.ycombinator.com/item?id=40938701",
    498         "created_at": "2024-07-11T17:19:07Z"
    499       },
    500       {
    501         "hn_id": "32218471",
    502         "title": "Drivable Volumetric Avatars Using Texel-Aligned Features",
    503         "points": 3,
    504         "comments": 0,
    505         "url": "https://news.ycombinator.com/item?id=32218471",
    506         "created_at": "2022-07-24T22:36:31Z"
    507       },
    508       {
    509         "hn_id": "44553930",
    510         "title": "Defending Against Prompt Injection with a Few DefensiveTokens",
    511         "points": 2,
    512         "comments": 0,
    513         "url": "https://news.ycombinator.com/item?id=44553930",
    514         "created_at": "2025-07-13T21:32:40Z"
    515       },
    516       {
    517         "hn_id": "47041986",
    518         "title": "A Survey of In-Context Reinforcement Learning",
    519         "points": 2,
    520         "comments": 0,
    521         "url": "https://news.ycombinator.com/item?id=47041986",
    522         "created_at": "2026-02-17T00:01:18Z"
    523       },
    524       {
    525         "hn_id": "43296207",
    526         "title": "The Widespread Adoption of Large Language Model-Assisted Writing Across Society",
    527         "points": 2,
    528         "comments": 0,
    529         "url": "https://news.ycombinator.com/item?id=43296207",
    530         "created_at": "2025-03-08T00:09:53Z"
    531       },
    532       {
    533         "hn_id": "43088092",
    534         "title": "The Widespread Adoption of Large Language Model-Assisted Writing Across Society",
    535         "points": 2,
    536         "comments": 0,
    537         "url": "https://news.ycombinator.com/item?id=43088092",
    538         "created_at": "2025-02-18T10:30:59Z"
    539       },
    540       {
    541         "hn_id": "38091292",
    542         "title": "Communicative Agents for Software Development",
    543         "points": 2,
    544         "comments": 0,
    545         "url": "https://news.ycombinator.com/item?id=38091292",
    546         "created_at": "2023-10-31T21:00:02Z"
    547       },
    548       {
    549         "hn_id": "37786498",
    550         "title": "Communicative Agents for Software Development",
    551         "points": 2,
    552         "comments": 0,
    553         "url": "https://news.ycombinator.com/item?id=37786498",
    554         "created_at": "2023-10-06T02:13:18Z"
    555       },
    556       {
    557         "hn_id": "46452714",
    558         "title": "Performance Evaluation of Brokerless Messaging Libraries",
    559         "points": 1,
    560         "comments": 0,
    561         "url": "https://news.ycombinator.com/item?id=46452714",
    562         "created_at": "2026-01-01T09:44:55Z"
    563       },
    564       {
    565         "hn_id": "45486277",
    566         "title": "Brain Graph Augmentation via Learnable Edge Masking for Psychiatric Diagnosis",
    567         "points": 1,
    568         "comments": 0,
    569         "url": "https://news.ycombinator.com/item?id=45486277",
    570         "created_at": "2025-10-05T23:45:56Z"
    571       }
    572     ],
    573     "top_points": 27,
    574     "total_points": 44,
    575     "total_comments": 0
    576   }
    577 }

Impressum · Datenschutz