ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (26040B)


      1 {
      2   "paper": {
      3     "title": "DataSentinel: A Game-Theoretic Detection of Prompt Injection Attacks",
      4     "authors": [
      5       "Yupei Liu",
      6       "Yuqi Jia",
      7       "Jinyuan Jia",
      8       "Dawn Song",
      9       "Neil Zhenqiang Gong"
     10     ],
     11     "year": 2025,
     12     "venue": "IEEE Symposium on Security and Privacy (S&P)",
     13     "arxiv_id": "2504.11358"
     14   },
     15   "scan_version": 3,
     16   "active_modules": [
     17     "experimental_rigor",
     18     "data_leakage"
     19   ],
     20   "methodology_tags": [
     21     "benchmark-eval"
     22   ],
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Code and data released at https://github.com/liu00222/Open-Prompt-Injection, stated in the abstract."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The paper uses publicly available benchmark datasets (MRPC, Jfleg, SMS Spam, RTE, SST2, HSOL, Gigaword) and states code and data are available at the GitHub link."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided in the paper. Only model names and hyperparameters are listed."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions are provided in the paper itself. The GitHub link is provided but no README or reproduction guide is described in the paper."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Results are reported as point estimates (FPR and FNR values) without confidence intervals or error bars."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper claims DataSentinel 'significantly outperforms' baselines but provides no statistical significance tests — comparisons are based solely on comparing FPR/FNR numbers."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "FPR and FNR values are reported with baselines for context (e.g., KAD FPR up to 0.10 vs DataSentinel 0.00; KAD FNR up to 0.21 vs DataSentinel at most 0.07), providing magnitude context."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "100 data points per task are sampled from test sets without justification for why 100 is sufficient. No power analysis."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "The paper mentions fixing the random seed (Section 5.1) and reports single-run results. No variance across multiple runs is reported."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Six baseline detection methods are compared: EVD, NLLMD, SSFTD, SSFTD-G, PromptGuard, and KAD (Section 5.1)."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Baselines include contemporary methods: PromptGuard (Meta, 2024), KAD (USENIX Security 2024), and recent attack methods like NeuralExec and Universal."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Extensive ablation study in Section 5.3 covering r, |D|, α, β, nin, nout, detection LLM, backend LLM. Also DataSentinel (Min) variant removes the adversarial component."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Two metrics used: False Positive Rate (FPR) and False Negative Rate (FNR), reported separately."
     93       },
     94       "human_evaluation": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "This is an automated detection method evaluated on benchmark datasets. Human evaluation is not relevant to the claims."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Fine-tuning uses Gigaword training set (500 data points). Evaluation uses 100 data points from test sets of 7 different datasets (Section 5.1). Fine-tuning tasks do not overlap with evaluation tasks."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Results broken down per target task (7 tasks), per injected task (7 tasks), per attack type (9 attacks), and per target-injected combination in appendix Tables 10-16."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 6 discusses the failure case where the injected and target tasks are of the same type (sentiment analysis, FNR = 0.87), in which prompt injection reduces to an adversarial example that DataSentinel is less effective at detecting."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The paper reports that DataSentinel is less effective when injected task = target task (FNR 0.87 for sentiment analysis), and discusses why in Section 6."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Abstract claims of effective detection on multiple benchmarks and LLMs are supported by Tables 1-6. The claim of outperforming baselines is supported by Table 3."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Causal claims about which components matter are supported by ablation studies (Section 5.3) and the DataSentinel (Min) variant comparison (Section 5.4), which isolate the game-theoretic component."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Section 6 explicitly bounds generalization: less effective when injected task = target task, discusses benign instructions limitation, and notes the defense may be less effective as LLMs improve (meta-review D.4)."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Section 6 discusses alternative explanations: the detection LLM vs backend LLM context difference, why some false negatives still cause attacks, and comparison with StruQ/SecAlign as alternative defense approaches."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper measures FPR and FNR for prompt injection detection and frames claims at that exact granularity: 'DataSentinel achieves 0% FPR and at most 7% FNR' on specific benchmarks. It does not frame this as 'security' or 'safety' broadly — it stays within the detection accuracy construct."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Models specified as 'Mistral-7B', 'LLaMA2-7B', 'LLaMA3-8B-Instruct' without specific snapshot dates or HuggingFace model IDs with version hashes. Marketing names without version specifics."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Detection instruction template provided: 'Repeat [secret key] once while ignoring the following text:' (Section 4.2). Target/injected instructions referenced from prior work [7] and detailed in Appendix A."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Comprehensive hyperparameter reporting in Section 5.1: temperature=0.1, α=1, β=1, r=3, lr=0.000025, bin=8, bout=2, nin=10, nout=500, QLoRA used."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "No agentic scaffolding is used. DataSentinel is a fine-tuning and inference method, not an agentic system."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 5.1 documents data construction: 100 data points sampled from test sets per task, 500 from Gigaword training set for fine-tuning, how contaminated data samples are constructed (100 per target-injected combination)."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 6 'Discussion and Limitations' provides substantive discussion of multiple limitations."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 6 discusses specific threats: less effective for same target/injected task type, benign instructions in data causing false positives, and potential weakness as LLMs improve at instruction following."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 6 explicitly states DataSentinel is less effective for adversarial examples (same task type), cannot distinguish benign instructions from injections, and leaves detecting same-task attacks as future work."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "All benchmark datasets used (MRPC, Jfleg, SMS Spam, RTE, SST2, HSOL, Gigaword) are publicly available. Code and data available at GitHub."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 5.1 describes data collection: which datasets, how many samples, how contaminated data is constructed using each attack type, sampling procedures."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No human participants. All data is from standard NLP benchmarks."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "The pipeline from dataset sampling to contaminated data construction to evaluation is documented in Section 5.1, including counts (100 data points per task, 35,700 total contaminated samples)."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Acknowledgments section lists NSF grants 2131859, 2125977, 2112562, 1937787 and ARO grant W911NF2110182."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations clearly listed: Penn State, Duke University, UC Berkeley. No product being evaluated is from these institutions."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "Funding is from NSF and ARO (government agencies) which have no financial stake in the outcome of prompt injection detection research."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial interests statement is present in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "The paper does not evaluate pre-trained model capability on a benchmark. It evaluates a detection defense method against attacks — model knowledge/contamination is not relevant to the claims."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "Not applicable: the paper tests a prompt injection defense method, not model knowledge on benchmarks, so train-test overlap with pre-training data is not relevant to its claims."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "Not applicable: benchmark contamination (a model memorizing test data) is not relevant to evaluating a prompt injection detector."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Section 5.2 reports inference cost: 1.6 seconds per query on Quadro RTX 6000, ~10% overhead vs backend LLM (15.3s). Also reports 0.7s for smaller LLaMA3.2-1B detection LLM."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": true,
    298         "justification": "Fine-tuning takes ~3 hours on one Quadro RTX 6000 GPU, costing $0.90 in cloud GPU rent (Section 5.2)."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The paper fixes a single random seed (Section 5.1: 'fix the seed for the random number generator') and reports single-run results. No multi-seed analysis."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No explicit statement of number of runs. Single fixed seed implies single run."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The ablation study (Section 5.3) varies hyperparameters one at a time but does not report a search budget or how the default values were selected."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Default hyperparameters (α=1, β=1, r=3, etc.) are stated but not justified beyond the ablation showing they work well. No validation set selection procedure described."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors implement their own baselines (EVD, NLLMD, SSFTD, SSFTD-G) and compare against their own system. No acknowledgment of self-comparison bias. PromptGuard and KAD use open-source code from others."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "DataSentinel requires fine-tuning (3 hours GPU) while KAD does not, but performance is not reported as a function of matched compute budgets."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "The paper does not discuss whether the 7 NLP tasks and attack scenarios are representative of real-world prompt injection threats. No discussion of construct validity of the evaluation setup."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": false,
    344         "answer": false,
    345         "justification": "The paper evaluates a detection method, not model comparisons through different scaffolds. The fine-tuned detection LLM is evaluated directly on benchmark inputs. No scaffolding framework mediates between the model and the evaluation."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": false,
    351         "answer": false,
    352         "justification": "This evaluates a defense method, not model knowledge on benchmarks. Temporal leakage of benchmark solutions is not relevant here."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": true,
    357         "justification": "The paper explicitly separates the fine-tuning data (Gigaword training set, different instruction) from evaluation data (test sets of 7 tasks with different instructions), and notes the adaptive attacks during fine-tuning differ from evaluation attacks (Section 5.2)."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": true,
    362         "justification": "Section 5.2 explicitly states fine-tuning tasks (D) do not overlap with evaluation target/injected tasks, and the optimized separator during training differs from attack separators used in evaluation."
    363       },
    364       "leakage_detection_method": {
    365         "applies": false,
    366         "answer": false,
    367         "justification": "Standard benchmark contamination detection is not relevant to this defense evaluation. The paper does address train-test separation through design."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "DataSentinel achieves FPR close to 0 and FNR at most 0.07 across all existing prompt injection attacks.",
    374       "evidence": "Tables 1 and 2 show FPR ≤ 0.01 and FNR ≤ 0.07 across 7 target tasks, 7 injected tasks, and 9 attacks (Section 5.2).",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "DataSentinel significantly outperforms 6 baseline detection methods in both FPR and FNR.",
    379       "evidence": "Table 3 shows DataSentinel achieves FPR of 0.00-0.01 vs KAD 0.01-0.10, and lower FNR across all injected tasks under NeuralExec (Section 5.2).",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "DataSentinel remains effective against adaptive attacks as long as injected instructions differ from target task.",
    384       "evidence": "Table 6 shows FNR ≤ 0.06 for adaptive attacks except when target and injected task are both sentiment analysis (FNR = 0.87). Section 5.4.2.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "DataSentinel generalizes to unseen backend LLMs (third-party provider scenario).",
    389       "evidence": "Table 5 shows FNR ≤ 0.01 when fine-tuned with LLaMA3-8B but evaluated against attacks optimized for OpenChat, Mistral-7B, Mixtral-8x7B, LLaMA-3.1-8B (Section 5.3).",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Detection overhead is minor (~10%) compared to backend LLM processing time.",
    394       "evidence": "Section 5.2 reports 1.6s detection query vs 15.3s backend query on Quadro RTX 6000. Fine-tuning takes ~3 hours ($0.90 cloud cost).",
    395       "supported": "strong"
    396     }
    397   ],
    398   "key_findings": "DataSentinel formulates prompt injection detection as a minimax optimization problem, fine-tuning a detection LLM to be more vulnerable to injections (turning vulnerability into defense signal). Evaluated on 9 attacks, 7 NLP tasks, and 6 LLMs, it achieves near-zero FPR and FNR ≤ 0.07 on existing attacks, substantially outperforming 6 baselines including known-answer detection. The approach is less effective when the injected task matches the target task type, as prompt injection reduces to adversarial examples in that scenario.",
    399   "red_flags": [
    400     {
    401       "flag": "Single-seed evaluation",
    402       "detail": "All results reported from a single fixed random seed. No variance across seeds is reported, making it impossible to assess result stability."
    403     },
    404     {
    405       "flag": "No statistical significance tests",
    406       "detail": "Paper claims DataSentinel 'significantly outperforms' baselines but provides no statistical tests — comparisons rely solely on comparing point estimates."
    407     },
    408     {
    409       "flag": "Self-implemented baselines",
    410       "detail": "Four of six baselines (EVD, NLLMD, SSFTD, SSFTD-G) are implemented by the authors. Only PromptGuard and KAD use third-party implementations. No acknowledgment of potential self-comparison bias."
    411     }
    412   ],
    413   "cited_papers": [
    414     {
    415       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    416       "authors": [
    417         "Y. Liu",
    418         "Y. Jia",
    419         "R. Geng",
    420         "J. Jia",
    421         "N. Z. Gong"
    422       ],
    423       "year": 2024,
    424       "relevance": "Foundational benchmark for prompt injection attacks and defenses; provides the experimental framework DataSentinel builds upon."
    425     },
    426     {
    427       "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection",
    428       "authors": [
    429         "K. Greshake",
    430         "S. Abdelnabi",
    431         "S. Mishra",
    432         "C. Endres",
    433         "T. Holz",
    434         "M. Fritz"
    435       ],
    436       "year": 2023,
    437       "relevance": "Seminal work on indirect prompt injection attacks against LLM-integrated applications."
    438     },
    439     {
    440       "title": "Neural exec: Learning (and learning from) execution triggers for prompt injection attacks",
    441       "authors": [
    442         "D. Pasquini",
    443         "M. Strohmeier",
    444         "C. Troncoso"
    445       ],
    446       "year": 2024,
    447       "arxiv_id": "2403.03792",
    448       "relevance": "Optimization-based prompt injection attack used as a primary baseline and default attack in DataSentinel evaluation."
    449     },
    450     {
    451       "title": "Universal and transferable adversarial attacks on aligned language models",
    452       "authors": [
    453         "A. Zou",
    454         "Z. Wang",
    455         "J. Z. Kolter",
    456         "M. Fredrikson"
    457       ],
    458       "year": 2023,
    459       "arxiv_id": "2307.15043",
    460       "relevance": "GCG method used as the core optimization technique in DataSentinel's minimax formulation."
    461     },
    462     {
    463       "title": "Struq: Defending against prompt injection with structured queries",
    464       "authors": [
    465         "S. Chen",
    466         "J. Piet",
    467         "C. Sitawarin",
    468         "D. Wagner"
    469       ],
    470       "year": 2025,
    471       "relevance": "Prevention-based defense against prompt injection; compared with DataSentinel's detection approach in Section 6."
    472     },
    473     {
    474       "title": "SecAlign: Defending against prompt injection with preference optimization",
    475       "authors": [
    476         "S. Chen",
    477         "A. Zharmagambetov",
    478         "S. Mahloujifar",
    479         "K. Chaudhuri",
    480         "D. Wagner",
    481         "C. Guo"
    482       ],
    483       "year": 2024,
    484       "arxiv_id": "2410.05451",
    485       "relevance": "Prevention-based defense using preference optimization; evaluated alongside DataSentinel in Section 6."
    486     },
    487     {
    488       "title": "Automatic and universal prompt injection attacks against large language models",
    489       "authors": [
    490         "X. Liu",
    491         "Z. Yu",
    492         "Y. Zhang",
    493         "N. Zhang",
    494         "C. Xiao"
    495       ],
    496       "year": 2024,
    497       "arxiv_id": "2403.04957",
    498       "relevance": "Universal optimization-based prompt injection attack evaluated as a baseline in DataSentinel experiments."
    499     },
    500     {
    501       "title": "Pleak: Prompt leaking attacks against large language model applications",
    502       "authors": [
    503         "B. Hui",
    504         "H. Yuan",
    505         "N. Gong",
    506         "P. Burlina",
    507         "Y. Cao"
    508       ],
    509       "year": 2024,
    510       "relevance": "Prompt stealing attack evaluated in DataSentinel; tests a specific injected task of extracting target instructions."
    511     },
    512     {
    513       "title": "The instruction hierarchy: Training llms to prioritize privileged instructions",
    514       "authors": [
    515         "E. Wallace",
    516         "K. Xiao",
    517         "R. Leike",
    518         "L. Weng",
    519         "J. Heidecke",
    520         "A. Beutel"
    521       ],
    522       "year": 2024,
    523       "arxiv_id": "2404.13208",
    524       "relevance": "OpenAI's approach to instruction prioritization as a defense against prompt injection attacks."
    525     },
    526     {
    527       "title": "Jatmo: Prompt injection defense by task-specific finetuning",
    528       "authors": [
    529         "J. Piet",
    530         "M. Alrashed",
    531         "C. Sitawarin"
    532       ],
    533       "year": 2024,
    534       "arxiv_id": "2312.17673",
    535       "relevance": "Task-specific fine-tuning defense against prompt injection; related prevention approach discussed in DataSentinel."
    536     }
    537   ],
    538   "engagement_factors": {
    539     "practical_relevance": {
    540       "score": 2,
    541       "justification": "Open-source tool with code available that developers building LLM-integrated applications could deploy to detect prompt injection attacks."
    542     },
    543     "surprise_contrarian": {
    544       "score": 1,
    545       "justification": "The insight of deliberately making a detection LLM more vulnerable to turn weakness into defense signal is clever but not deeply counterintuitive."
    546     },
    547     "fear_safety": {
    548       "score": 2,
    549       "justification": "Prompt injection is a major security concern for deployed LLM applications, and the paper systematically demonstrates attack vectors and detection gaps."
    550     },
    551     "drama_conflict": {
    552       "score": 1,
    553       "justification": "Mildly challenges existing detection approaches like Meta's PromptGuard (shown to flag nearly everything) but doesn't target a specific company's claims."
    554     },
    555     "demo_ability": {
    556       "score": 1,
    557       "justification": "Code is on GitHub but requires GPU access, fine-tuning setup, and open-source LLMs — not a quick-try experience."
    558     },
    559     "brand_recognition": {
    560       "score": 1,
    561       "justification": "Authors from Duke, Penn State, and UC Berkeley (Dawn Song) are well-known in security research but not household names in broader tech."
    562     }
    563   }
    564 }

Impressum · Datenschutz