scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (34153B)
      1 {
      2   "paper": {
      3     "title": "Defensive Prompt Patch: A Robust and Generalizable Defense of Large Language Models against Jailbreak Attacks",
      4     "authors": [
      5       "Chen Xiong",
      6       "Xiangyu Qi",
      7       "Pin-Yu Chen",
      8       "Tsung-Yi Ho"
      9     ],
     10     "year": 2024,
     11     "venue": "arXiv.org",
     12     "arxiv_id": "2405.20099",
     13     "doi": "10.48550/arXiv.2405.20099"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "Defensive Prompt Patch (DPP) uses a Hierarchical Genetic Algorithm to optimize a human-readable suffix prompt that reduces jailbreak Attack Success Rate to 3.8% on Llama-2-7B-Chat and 2.0% on Mistral-7B-Instruct-v0.2, while maintaining high utility (Win-Rate of 82.98% and 75.06% respectively). DPP outperforms existing prompt-based defenses (Self-Reminder, Goal Prioritization, RPO) in average ASR under both non-adaptive and adaptive attack settings across 7 jailbreak attack types. The method generalizes across models (Llama-2, Mistral, Vicuna, Llama-3) and unforeseen jailbreak queries, though utility degradation is higher on less-aligned models like Mistral.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper provides a HuggingFace Spaces link (https://huggingface.co/spaces/TrustSafeAI/Defensive-Prompt-Patch-Jailbreak-Defense) in the abstract footnote and an anonymous repository link (https://anonymous.4open.science/r/DPP-23FF/README.md) in Appendix Z."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper uses publicly available datasets: AdvBench harmful behaviors (Sec. 4.1, with GitHub link) and AlpacaDataCleaned (Sec. 4.1, with GitHub link). Both are standard public benchmarks."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "Appendix C mentions 'single A800 GPU with 80GB of memory' and lists hyperparameters, but no requirements.txt, Dockerfile, Python version, or library version specifications are provided in the paper."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are provided in the paper. The algorithms are described in detail and a repository link is given, but the paper itself lacks explicit commands or a reproducibility guide."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "All results across Tables 2-5, 22-29, and appendix tables are reported as point estimates (e.g., '0.038 Average ASR', '82.98 Win-Rate') with no confidence intervals, error bars, or uncertainty measures."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper makes numerous comparative claims (e.g., 'outperforms RPO by 42% for ICA attack') based solely on comparing point estimates. No statistical significance tests (p-values, t-tests, etc.) are applied to any comparison."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper consistently reports absolute ASR values and percentage improvements with baseline context (e.g., 'outperforms the ASR of RPO by 42% for ICA attack, 18% for AutoDAN, and 15% for GCG attack' in Sec. 4.2), providing sufficient context to judge magnitude."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper uses 100 harmful queries from AdvBench and 100 benign queries from Alpaca (Sec. 4.1) without justifying why 100 was chosen or discussing whether this sample size is sufficient for the claims made."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No standard deviations, variance, or spread measures are reported for any experimental results. Table 7 shows 3 initializations with individual results but reports only 'Average' without standard deviation."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper compares against Self-Reminder, Goal Prioritization, RPO (for Llama-2), System Prompt (for Mistral), and DRO (Appendix O). Multiple baselines are included for each model in Tables 2-5."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Baselines include Self-Reminder (2023), Goal Prioritization (2023), RPO (2024), and System Prompt approaches. These are contemporary prompt-based defenses representing the state of the art at the time of writing."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Extensive ablation studies in Appendix B cover: objective function components (Table 6), prefix vs. suffix format (Table 7), different prototype initializations (Tables 9-10), alternative solver RLPrompt vs HGA (Table 11), and sentence-level synonym substitution (Table 12)."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper uses Attack Success Rate (ASR) for defense effectiveness and Win-Rate (AlpacaEval) for utility degradation, plus Min Over Prompt metric in Appendix Y and Llama-Guard-based ASR in Appendix L."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "All evaluation is automated: keyword-based ASR detection (Appendix I), Llama-Guard as judge (Appendix L), and AlpacaEval Win-Rate using model-based comparison. No human evaluation of defense quality or jailbreak outputs is conducted."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "The main experiments appear to use the same 100 AdvBench queries for both DPP training (Adversarial Dataset, Sec. 4.1) and evaluation. While Sec. 4.3 tests on 'another 100 harmful queries' as unforeseen queries, the primary results in Tables 2-5 do not clearly separate training and test queries."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are broken down by individual attack type (Base64, ICA, AutoDAN, GCG, PAIR, TAP, Catastrophic) across all evaluation tables (Tables 2-5, 22-29), providing per-category performance."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper discusses cases where DPP is less effective: higher utility degradation on Mistral (Sec. 4.3, 'the cost of making Mistral-7B-Instruct-v0.2 robust to jailbreak attacks on utility is more significant'), PAIR/TAP adaptive ASR of 83.7%/84.0% on Mistral (Table 5), and prefix DPP weakness under adaptive attacks (Table 7)."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Ablation results show: No Defense setting (α=0) yields ASR of 16% (Table 6), No Helpful setting (β=0) drops Win-Rate to 65.34% (Table 6), RLPrompt achieves only 47.89 Win-Rate vs HGA's 82.98 (Table 11), and removing synonym substitution increases average ASR from 7% to 15% (Table 12)."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claims of 'significant reductions in ASR with negligible impact on utility' are supported by Tables 2 and 4 (3.8% and 2.0% average ASR), and Win-Rate results (82.98% on Llama-2 vs 81.37% without defense)."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper makes causal claims ('DPP reduces ASR') supported by controlled ablation studies (Appendix B) that systematically vary individual components (objective functions, suffix vs prefix, solver, synonym substitution) while holding others constant."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title claims 'Defense of Large Language Models' generally, and the conclusion states 'potential as a universal defensive solution in various LLM models.' However, testing is limited to four 7B-13B open-weight models (Llama-2-7B-Chat, Mistral-7B, Vicuna-13B, Llama-3-8B). No larger models, closed-source models, or fundamentally different architectures are tested."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper does not discuss alternative explanations for why DPP works. Appendix Q offers post-hoc hypotheses about specific DPP words ('defective components', 'thorough') but does not consider confounds like whether the suffix length alone or the added token count rather than semantic content explains the defense effectiveness."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper uses keyword-matching ASR as the primary safety metric without discussing whether keyword detection truly captures harmful output generation. A response could contain a refusal keyword yet still go on to provide harmful content (and be counted as defended), or contain no refusal keyword while being harmless or off-topic (and be counted as a successful attack). The Llama-Guard evaluation (Appendix L) partially addresses this but the gap between keyword ASR and actual safety is never discussed."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Specific model versions are stated: 'Llama-2-7B-Chat' (Touvron et al., 2023), 'Mistral-7B-Instruct-v0.2' (Jiang et al., 2023), 'Vicuna-13B-v1.5' (Appendix W), and 'Llama-3-8B-Instruct' (Appendix X). These are precise, versioned model identifiers."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Full prompt text is provided for all DPP variants (Appendix E), all baseline defense prompts (Appendix H including Self-Reminder, System Prompt, Goal Prioritization), and the actual DPP suffixes used for each model."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Detailed hyperparameters are reported in Appendix C: num_steps=100, batch_size=64, num_elites=0.1, crossover_rate=0.5, mutation_rate=0.01, α and β values for each model. Attack hyperparameters are in Appendix F."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. DPP is a suffix prompt appended to queries without any tool use, memory, or multi-step agent workflow."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 4.1 documents data preparation: 'we sampled 100 jailbreak questions' from AdvBench, generated denial responses using Llama-2-7B-Chat to create the Adversarial Dataset; similarly sampled 100 from Alpaca for the Utility Dataset. Jailbreak query generation procedures are detailed in Appendix F."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "A dedicated 'Limitation' section follows the conclusion, discussing computational efficiency, cost of GPT-4 for training, limitations of other baselines, and vulnerability to modification on open-weight models."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The Limitation section identifies specific threats: DPP training is computationally intensive and requires GPT-4 access (~$75/training); DPP 'can be easily removed by malicious actors' on open-weight local models; Self-Reminder's training 'works poorly on Llama-2-7B-Chat' due to its alignment."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The Limitation section explicitly states: 'if users run an open-weight model locally, DPP or any system prompts can be easily removed by malicious actors. Thus, the LLMs will still be vulnerable to the Jailbreak Attacks. Under such context, DPP will not be able to protect the actual safety of the open-weighted model.'"
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "Raw experimental outputs (model responses, per-query ASR decisions) are not released. Only aggregated ASR percentages and Win-Rate values are reported in the tables."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 4.1 describes data collection: 100 harmful behaviors from AdvBench fed into Llama-2-7B-Chat to generate refusal responses; 100 benign queries from Alpaca with answers; jailbreak prompts generated via specific attack methods with referenced GitHub repositories."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. The paper uses standard public benchmarks (AdvBench, Alpaca) and automated evaluation."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The pipeline is documented: sample queries from AdvBench/Alpaca → generate refusal/helpful responses → create adversarial/utility datasets → apply attack methods to generate jailbreak queries (Appendix F details each attack's procedure) → evaluate with keyword matching or Llama-Guard."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The Acknowledgment section states: 'Chen Xiong and Tsung-Yi Ho, from the JC STEM Lab of Intelligent Design Automation, are funded by the Hong Kong Jockey Club Charities Trust.'"
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are clearly listed: CUHK (Xiong, Ho), Princeton University (Qi), IBM Research (Chen). None of the authors are affiliated with the evaluated model providers (Meta, Mistral AI)."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "The Hong Kong Jockey Club Charities Trust is a philanthropic organization with no commercial stake in jailbreak defense results or the evaluated models."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial disclosure statement is present in the paper. While Pin-Yu Chen is at IBM Research, there is no statement about whether IBM has financial interests related to the findings."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "This paper tests defense mechanisms against jailbreak attacks, not model knowledge/capability on benchmarks. The contamination concern (model having seen test data) is not directly relevant to evaluating defense effectiveness."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "This paper evaluates defense mechanisms rather than model capability on benchmarks. Train/test overlap in the benchmark contamination sense is not the relevant concern."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "This paper evaluates jailbreak defenses, not pre-trained model capability on knowledge benchmarks. Benchmark contamination of the underlying model is not the target of evaluation."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study. All evaluation is automated using keyword matching, Llama-Guard, and AlpacaEval."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants. The study uses only automated benchmarks and model evaluations."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "The paper states DPP has 'high inference efficiency by simply attaching the DPP to the user query' (Limitation section) but provides no quantitative inference cost, latency measurements, or token overhead analysis."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "Appendix C reports: 'single A800 GPU with 80GB of memory', 15.32 seconds computational time per training epoch, 100 epochs per training instance, and approximately $75 total training cost for GPT-4 revisions."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No mention of multiple random seeds for the main experiments. Table 7 shows 3 different initializations but these are different prototype prompts, not random seed variations."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The number of experimental runs for main results is never explicitly stated. It appears results are from single runs of each attack/defense combination."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "While hyperparameters are reported (Appendix C), no search budget is described. The paper does not state how many configurations were tried or how α=1,β=10 (Llama-2) and α=10,β=1 (Mistral) were selected."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "Algorithm 5 describes the selection procedure: iteratively optimizing DPP candidates over N data pairs and selecting the best DPP based on highest total score (Eq. 6). The selection criterion (combined defense + utility score) is clearly defined."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The paper makes numerous comparisons across 7 attack types, multiple models, and multiple baselines without any statistical tests, let alone corrections for multiple comparisons."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors implement and evaluate their own system against their own implementations/configurations of baselines, with no acknowledgment of self-comparison bias. RPO uses the suffix released by the RPO authors, but the other baselines are configured by this paper's authors."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "DPP requires iterative genetic optimization with GPT-4 calls (~$75), while baselines like Self-Reminder and Goal Prioritization are hand-crafted with zero optimization cost. This compute difference is not discussed in relation to performance gains."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The primary ASR metric uses keyword matching (Appendix I) — checking if responses contain strings like 'I'm sorry' or 'I cannot'. The paper does not discuss whether this proxy truly captures harmful vs. safe outputs. The Llama-Guard evaluation (Appendix L) provides an alternative but construct validity of keyword-based ASR is never examined."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is involved. DPP is a suffix prompt, not an agentic system."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether AdvBench queries or Alpaca data were in the training data of Llama-2 or Mistral. Since AdvBench was published in 2023, these queries could appear in later model training data."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the evaluation setup leaks information. The DPP is optimized using log-likelihood scores from the target model (Eq. 4-5), which requires white-box access, but this leakage concern is not discussed."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "The adversarial dataset (100 queries for DPP training) and main evaluation queries appear to be drawn from the same AdvBench pool. Non-independence between training and evaluation data is not discussed, though the 'unforeseen queries' experiment (Sec. 4.3) explicitly uses separate queries."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention methods are applied. No canary strings, membership inference, or decontamination procedures are mentioned."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "DPP achieves the lowest average ASR (3.8%) on Llama-2-7B-Chat while maintaining the highest Win-Rate (82.98%) among all defense baselines under non-adaptive attacks.",
    370       "evidence": "Table 2 shows DPP Average ASR of 0.038 vs RPO (0.168), Goal Prioritization (0.100), Self-Reminder (0.063), with Win-Rate of 82.98% vs RPO (79.23%), Goal Prioritization (34.29%), Self-Reminder (64.84%).",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "DPP achieves the lowest average adaptive ASR (13.0%) on Llama-2-7B-Chat, outperforming the second-best method by more than 4 percentage points.",
    375       "evidence": "Table 3 shows DPP Average Adaptive ASR of 0.130 vs Self-Reminder (0.177), Goal Prioritization (0.247), RPO (0.457).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "DPP generalizes to less-aligned models, reducing Mistral-7B average ASR to 2.0% under non-adaptive attacks.",
    380       "evidence": "Table 4 shows DPP Average ASR of 0.020 on Mistral-7B vs Self-Reminder (0.482), System Prompt (0.527), Goal Prioritization (0.222), though Win-Rate drops to 75.06% (from Mistral baseline of 90.31%).",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "DPP is robust against adaptive attacks on Mistral-7B with the lowest average adaptive ASR (46.9%).",
    385       "evidence": "Table 5 shows DPP at 0.469 average adaptive ASR. However, PAIR (0.837) and TAP (0.840) adaptive ASR values are very high, indicating DPP struggles against these specific adaptive attacks on Mistral.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Suffix DPP outperforms Prefix DPP, particularly under adaptive settings with a 42-percentage-point ASR gap.",
    390       "evidence": "Table 7 shows average GCG Adaptive ASR of 0.15 for Suffix DPP vs 0.57 for Prefix DPP, and average Win-Rate of 76.09% vs 73.05%.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Both defense and utility objectives are necessary for optimal DPP performance.",
    395       "evidence": "Table 6 ablation: No Defense (α=0) gives 16% GCG ASR and 72.85% Win-Rate; No Helpful (β=0) gives 3% GCG ASR but only 65.34% Win-Rate. Full DPP achieves 4% GCG ASR with 82.98% Win-Rate.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "HGA is superior to RLPrompt as the optimization method for DPP.",
    400       "evidence": "Table 11 shows RLPrompt achieves GCG ASR of 0.15 and Win-Rate of 47.89, compared to HGA's GCG ASR of 0.04 and Win-Rate of 82.98 on Llama-2-7B-Chat.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "No error bars or statistical tests",
    407       "detail": "All results across all tables are point estimates with no confidence intervals, standard deviations, or significance tests. Claims of 'outperforming' baselines rest entirely on comparing single numbers without any statistical validation of whether differences are meaningful."
    408     },
    409     {
    410       "flag": "Keyword-based ASR is a crude proxy for safety",
    411       "detail": "The primary evaluation metric (Appendix I) checks whether model responses contain refusal keywords like 'I'm sorry' or 'I cannot'. This can both miss harmful responses that still include a refusal phrase and count harmless or off-topic responses that lack these keywords as successful attacks. While Llama-Guard evaluation is provided in appendices, the main results use keyword matching."
    412     },
    413     {
    414       "flag": "Potential train-test overlap in main experiments",
    415       "detail": "The adversarial dataset for DPP training uses 100 sampled AdvBench queries, and evaluation also uses 100 AdvBench queries. It is unclear whether these are the same 100 queries. Only the 'unforeseen queries' experiment (Sec. 4.3) explicitly uses a separate sample."
    416     },
    417     {
    418       "flag": "Overclaiming generalization from small model set",
    419       "detail": "The title claims 'Defense of Large Language Models' and the conclusion claims 'universal defensive solution', but testing covers only four 7B-13B open-weight models. No closed-source models, no models above 13B, and no fundamentally different architectures are tested."
    420     },
    421     {
    422       "flag": "High adaptive ASR on Mistral undercuts robustness claims",
    423       "detail": "Table 5 shows DPP achieves PAIR adaptive ASR of 83.7% and TAP adaptive ASR of 84.0% on Mistral. The 46.9% average adaptive ASR (best among baselines) still means nearly half of adaptive attacks succeed, yet the paper frames this as demonstrating 'robustness'."
    424     },
    425     {
    426       "flag": "Asymmetric compute budget between DPP and baselines",
    427       "detail": "DPP requires iterative genetic optimization with GPT-4 calls (~$75 training cost, A800 GPU) while baselines like Self-Reminder and Goal Prioritization are hand-crafted with zero compute. This asymmetry is not discussed when comparing performance."
    428     }
    429   ],
    430   "cited_papers": [
    431     {
    432       "title": "Universal and transferable adversarial attacks on aligned language models",
    433       "authors": ["Andy Zou", "Zifan Wang", "J. Zico Kolter", "Matt Fredrikson"],
    434       "year": 2023,
    435       "arxiv_id": "2307.15043",
    436       "relevance": "Introduces the GCG attack, a foundational gradient-based jailbreak method evaluated in this paper and widely studied in LLM safety research."
    437     },
    438     {
    439       "title": "AutoDAN: Generating stealthy jailbreak prompts on aligned large language models",
    440       "authors": ["Xiaogeng Liu", "Nan Xu", "Muhao Chen", "Chaowei Xiao"],
    441       "year": 2023,
    442       "relevance": "Introduces AutoDAN jailbreak attack using genetic algorithms, which directly inspires DPP's HGA optimization approach."
    443     },
    444     {
    445       "title": "Jailbreaking black box large language models in twenty queries",
    446       "authors": ["Patrick Chao", "Alexander Robey", "Edgar Dobriban", "Hamed Hassani", "George J. Pappas", "Eric Wong"],
    447       "year": 2023,
    448       "relevance": "Introduces the PAIR attack method for automated black-box jailbreaking of LLMs, a key attack evaluated in this paper."
    449     },
    450     {
    451       "title": "Tree of attacks: Jailbreaking black-box LLMs automatically",
    452       "authors": ["Anay Mehrotra", "Manolis Zampetakis", "Paul Kassianik", "Blaine Nelson", "Hyrum Anderson", "Yaron Singer", "Amin Karbasi"],
    453       "year": 2023,
    454       "relevance": "Introduces the TAP attack using tree-of-thought jailbreaking, evaluated as both a non-adaptive and an adaptive attack in this paper."
    455     },
    456     {
    457       "title": "Defending chatgpt against jailbreak attack via self-reminders",
    458       "authors": ["Yueqi Xie", "Jingwei Yi", "Jiawei Shao", "Justin Curl", "Lingjuan Lyu", "Qifeng Chen", "Xing Xie", "Fangzhao Wu"],
    459       "year": 2023,
    460       "relevance": "Introduces Self-Reminder defense, a primary baseline in this paper and a key prompt-based defense approach."
    461     },
    462     {
    463       "title": "Robust prompt optimization for defending language models against jailbreaking attacks",
    464       "authors": ["Andy Zhou", "Bo Li", "Haohan Wang"],
    465       "year": 2024,
    466       "arxiv_id": "2401.17263",
    467       "relevance": "Introduces RPO defense using optimized prompt suffixes, a primary baseline in this paper for comparison against DPP."
    468     },
    469     {
    470       "title": "Defending large language models against jailbreaking attacks through goal prioritization",
    471       "authors": ["Zhexin Zhang", "Junxiao Yang", "Pei Ke", "Minlie Huang"],
    472       "year": 2023,
    473       "relevance": "Introduces Goal Prioritization defense, a key baseline that sacrifices utility for safety, contrasted with DPP's balanced approach."
    474     },
    475     {
    476       "title": "SmoothLLM: Defending large language models against jailbreaking attacks",
    477       "authors": ["Alexander Robey", "Eric Wong", "Hamed Hassani", "George J. Pappas"],
    478       "year": 2023,
    479       "relevance": "Proposes a perturbation-based defense against jailbreak attacks, representing an alternative defense paradigm to prompt-based approaches."
    480     },
    481     {
    482       "title": "Llama Guard: LLM-based input-output safeguard for human-AI conversations",
    483       "authors": ["Hakan Inan", "Kartikeya Upasani", "Jianfeng Chi"],
    484       "year": 2023,
    485       "relevance": "Used as an alternative LLM-based judge for ASR evaluation in this paper, representing safety classification approaches."
    486     },
    487     {
    488       "title": "JailbreakBench: An open robustness benchmark for jailbreaking large language models",
    489       "authors": ["Patrick Chao", "Edoardo Debenedetti", "Alexander Robey"],
    490       "year": 2024,
    491       "arxiv_id": "2404.01318",
    492       "relevance": "Provides a standardized jailbreak evaluation benchmark used for generalization testing of DPP."
    493     },
    494     {
    495       "title": "Jailbreaking leading safety-aligned LLMs with simple adaptive attacks",
    496       "authors": ["Maksym Andriushchenko", "Francesco Croce", "Nicolas Flammarion"],
    497       "year": 2024,
    498       "arxiv_id": "2404.02151",
    499       "relevance": "Introduces simple adaptive attack methods evaluated against DPP in Appendix N, representing recent advances in jailbreak techniques."
    500     },
    501     {
    502       "title": "Improved few-shot jailbreaking can circumvent aligned language models and their defenses",
    503       "authors": ["Xiaosen Zheng", "Tianyu Pang", "Chao Du", "Qian Liu", "Jing Jiang", "Min Lin"],
    504       "year": 2024,
    505       "arxiv_id": "2406.01288",
    506       "relevance": "Introduces I-FSJ few-shot jailbreak method evaluated against DPP, testing defense robustness against few-shot attack strategies."
    507     },
    508     {
    509       "title": "On prompt-driven safeguarding for large language models",
    510       "authors": ["Chujie Zheng", "Fan Yin", "Hao Zhou", "Fandong Meng", "Jie Zhou", "Kai-Wei Chang", "Minlie Huang", "Nanyun Peng"],
    511       "year": 2024,
    512       "arxiv_id": "2401.18018",
    513       "relevance": "Studies prompt-based safeguarding mechanisms for LLMs including system prompts and DRO, used as a baseline in Appendix O."
    514     },
    515     {
    516       "title": "Baseline defenses for adversarial attacks against aligned language models",
    517       "authors": ["Neel Jain", "Avi Schwarzschild", "Yuxin Wen"],
    518       "year": 2023,
    519       "relevance": "Establishes baseline defenses against adversarial attacks on LLMs, contextualizing the defense landscape DPP operates in."
    520     }
    521   ],
    522   "engagement_factors": {
    523     "practical_relevance": {
    524       "score": 2,
    525       "justification": "DPP is a deployable defense technique — a suffix prompt that can be appended to LLM queries at inference time without model retraining, making it usable by service providers."
    526     },
    527     "surprise_contrarian": {
    528       "score": 1,
    529       "justification": "The finding that a simple optimized suffix can outperform more complex defenses is somewhat surprising, but the general approach of prompt-based defense is well-established."
    530     },
    531     "fear_safety": {
    532       "score": 2,
    533       "justification": "Directly addresses LLM jailbreak attacks and safety vulnerabilities, a topic of active concern in the AI safety community."
    534     },
    535     "drama_conflict": {
    536       "score": 0,
    537       "justification": "No controversial claims, institutional conflicts, or dramatic narrative — a straightforward defense method paper."
    538     },
    539     "demo_ability": {
    540       "score": 2,
    541       "justification": "HuggingFace Spaces demo is available and code repository is released, allowing practitioners to try the defense."
    542     },
    543     "brand_recognition": {
    544       "score": 1,
    545       "justification": "IBM Research co-author (Pin-Yu Chen) is known in adversarial ML, and Princeton affiliation adds visibility, but this is not from a flagship AI lab or about a flagship product."
    546     }
    547   }
    548 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs