ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (29143B)


      1 {
      2   "paper": {
      3     "title": "Prompt Injection Attacks in Defended Systems",
      4     "authors": ["Daniil Khomsky", "Narek Maloyan", "Bulat Nutfullin"],
      5     "year": 2024,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2406.14048",
      8     "doi": "10.1007/978-3-031-80853-1_30"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval", "case-study"],
     13   "key_findings": "The paper investigates prompt injection attacks and defenses in the SaTML 2024 CTF competition, in which the authors placed 8th in attack and 9th in defense. Code-based attacks (e.g., asking the model to execute scripts or populate data structures with its secret) were the most effective category, particularly against GPT-3.5. The paper demonstrates that combining multiple basic attack strategies (distraction, teacher, system prompt, word-splitting, code-based) can bypass three-tiered defenses (system prompt, Python filter, LLM filter). GPT-3.5 was more susceptible to complex encoded attacks while Llama-2 leaked more system prompt information but struggled with encryption tasks.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper. The Python filter code is shown inline but no reproducible codebase is released."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper references the competition dataset released by organizers at https://huggingface.co/datasets/ethz-spylab/ctf-satml24 (Section III-F). This publicly available dataset contains the attacks, defenses, and secrets used by participants."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, dependency lists, library versions, or setup instructions are provided."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are included. The paper describes attack and defense strategies but provides no guide to replicate the experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Section V claims 'Confidence intervals and significance tests were used to assess the robustness of the results' but no actual confidence intervals or error bars appear in the tables or text. Tables I and II show only point estimates."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Section V claims significance tests were used but none are actually shown — no p-values, test statistics, or test names appear anywhere in the paper."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Only raw competition scores are reported (e.g., total attack score 3428 vs best team 9125). No formal effect sizes, relative improvements with context, or standardized measures are provided."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for sample size. The number of defenses attacked, the total number of competing teams, and power considerations are not discussed."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures reported. All results are single-value competition scores."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Tables I and II compare the authors' results against the top 3 teams for both defense and attack, providing competitive baselines from the competition."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The baselines are other teams from the same SaTML 2024 CTF competition, making them contemporary by definition."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No ablation study is performed. The paper describes multiple attack categories (distraction, teacher, system prompt, word-splitting, code-based) but never systematically measures the contribution of individual components or combinations."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper reports both defense metrics (the v_D normalization coefficient, Table I) and attack metrics (cumulative score_D, Table II), and separates results by model (Llama-2 vs GPT-3.5)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation of attack quality or defense effectiveness is performed. All evaluation is through the automated competition scoring system."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The competition structure separates the Reconnaissance sub-phase (exploration) from the Evaluation sub-phase (actual attack attempts with scores recorded), as described in Section III. This provides a structural separation between development and evaluation."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by model (Llama-2 vs GPT-3.5) in Tables I and II. Attack methods are categorized into distinct groups (distraction, teacher, system prompt, word-splitting, code-based, combined) with qualitative analysis per category."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses where attacks failed: e.g., GPT-3.5 does not output its system prompt upon direct request (Section IV-A-3), Llama-2 cannot handle ASCII encoding or decryption tasks (Section IV-C), and certain code-based attacks fail against Python filters that detect secrets in output (Section IV-B)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Several negative results are reported: the LLM filter was not used because it failed utility metrics (Section IV-B), ASCII code attacks fail when the model explains its work alongside the output (Section IV-C), and the initial defense had 'obvious flaws' discovered during the attack phase (Section IV-B)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract's claims are modest: it states the paper 'investigates methods for black-box attacks' and 'presents a methodology for vulnerability detection and defensive strategies.' The paper delivers on these through the CTF participation and detailed attack/defense descriptions."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes implicit causal claims such as 'the Python filter helps against attacks' and certain attack combinations 'successfully retrieved secret values' without controlled experimental design to isolate causal factors. The competition setting doesn't control for confounds between attack components."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title 'Prompt Injection Attacks in Defended Systems' implies generality beyond what was tested. Results are specific to GPT-3.5 and Llama-2 in the SaTML 2024 CTF with 512-character-limited defenses and 6-character random secrets, but the paper does not bound its claims to these specific conditions."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No alternative explanations are discussed for why certain attacks succeed or fail. For example, the paper doesn't consider whether attack success depends on model size, training data, or RLHF alignment strength rather than the attack strategy itself."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures secret extraction success, which directly corresponds to its claim about attack effectiveness. The CTF scoring formula (Section III-G) explicitly defines what success means, and the paper doesn't overclaim beyond this measurement."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper refers to 'Chat-GPT 3.5' and 'Llama 2' without specific versions, snapshot dates, or API versions (e.g., no 'gpt-3.5-turbo-0613' or 'Llama-2-7b-chat')."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Extensive actual prompt text is provided throughout: the system defense prompt (Section III-H), the full Python filter code (Section IV-B), and numerous actual attack prompts with both user inputs and model responses (Sections IV-A and IV-C)."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No hyperparameters are reported — no temperature, top-p, max tokens, or other generation settings for either GPT-3.5 or Llama-2."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The three-tiered defense architecture (system prompt, Python filter, LLM filter) is described in detail in Section III, including filter ordering, character limits, input/output format, and the Debug defense mode. Figure 1 shows the architecture diagram."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "No data preprocessing steps are documented. The paper does not describe how competition data was processed for analysis or how attack/defense pairs were selected for reporting."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No dedicated limitations or threats-to-validity section exists. The conclusion contains only a brief forward-looking statement about 'ongoing need to enhance AI security' without substantive self-criticism."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No specific threats to validity are discussed. The paper does not address potential issues such as the artificial nature of the CTF setting, the limited model selection, or the non-representativeness of 6-character random secrets."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No explicit scope boundaries are stated. The paper does not clarify what its results do NOT show — e.g., whether findings transfer to production systems, other models, longer secrets, or non-English attacks."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The competition dataset containing attacks and defenses was released by the organizers at huggingface.co/datasets/ethz-spylab/ctf-satml24 (Section III-F), allowing independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section III describes the CTF competition structure in detail: two phases (Defense and Attack), the Attack phase split into Reconnaissance and Evaluation, the scoring formula, and how secrets were generated (random alphanumeric sequence of 6 characters)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "No description of how CTF participants were recruited, how many teams participated in total, or whether the participant pool introduces selection bias (e.g., only security-focused researchers)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No documentation of the pipeline from raw competition interactions to the analyzed results presented in the paper. The paper does not explain how specific examples were selected or how the tables were compiled from the competition data."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section mentioning grants, sponsors, or funding agencies."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "Only a personal email address (homdanil123@gmail.com) is provided. No institutional affiliations are listed for any of the three authors."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding information is disclosed at all, making it impossible to assess funder independence."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement or financial disclosure appears in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This paper tests attack and defense strategies against LLMs in a CTF competition, not model knowledge on benchmarks. The secrets are randomly generated per defense, so training data contamination is not a concern."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "The paper tests defense/attack strategies, not model capability on benchmarks. Secrets are freshly generated random strings, so train/test overlap is structurally impossible."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "The CTF uses randomly generated secrets and custom defenses, not pre-existing benchmarks. Contamination in the traditional sense does not apply to this evaluation."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human subjects study. The paper reports results from a CTF competition where participants are adversarial researchers, not study subjects."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human subjects study — the paper analyzes attack/defense strategies in a competition setting."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human subjects study. Competition participant demographics are not relevant as human subjects data."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human subjects study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human subjects experimental study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human subjects experimental study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human subjects study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "Section VI states the approach is 'cheap in terms of resources spent' because it 'does not involve model training' but provides no quantification of API costs, tokens consumed, or wall-clock time."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No computational budget is stated — no API costs, GPU hours, or total resource consumption for the attack or defense experiments."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No seed sensitivity analysis. Results are from a single competition run with no repeated experiments across different random seeds."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is not stated. While the scoring formula includes X (number of chats created), the paper does not report how many attempts were made per defense."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported. The paper does not describe how attack prompts or defense configurations were tuned or how many variations were tried."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No justification for why the final attack/defense configurations were selected. The paper presents the chosen strategies without describing the selection process or reporting alternative configurations tried."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No statistical tests are actually shown despite being claimed in Section V, so no multiple comparison correction could have been applied."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors evaluate their own attack and defense methods in the competition but do not acknowledge the bias of self-evaluation or discuss how this might affect their interpretation of results."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "No analysis of performance as a function of compute budget. The scoring formula penalizes more attempts (chats) but the paper does not analyze how resource expenditure relates to attack success."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether the CTF competition format (protecting 6-character random secrets with 512-character-limited defenses) actually measures real-world LLM security properties. The ecological validity of the benchmark is not questioned."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "The defense scaffolding (system prompt + Python filter + LLM filter) IS the thing being tested. The paper evaluates bundled defense systems, not isolated model capabilities, so the scaffold confound does not apply."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of temporal leakage. While the CTF uses random secrets (mitigating direct leakage), the paper does not discuss whether models may have seen similar attack patterns or defense strategies in training data."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of feature leakage. The paper does not consider whether the competition setup leaks information (e.g., the system prompt structure being known to attackers provides privileged information)."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of non-independence. Multiple attacks against the same defense are not independent, and the paper does not address whether correlation between defense strategies affects the analysis."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is described or applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Despite advanced three-tiered defenses (system prompt, Python filter, LLM filter), significant security gaps remain exploitable through prompt injection attacks on LLMs.",
    365       "evidence": "Competition results showing the authors' defense ranked 9th (Table I) with multiple teams breaking it, and their attacks ranked 8th (Table II) demonstrating they could break many defenses. Section IV-C shows specific combined attacks that bypass all three defense layers.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "Code-based attacks are the most promising attack category against defended LLMs.",
    370       "evidence": "Section IV-A-5 describes code-based attacks as 'the largest and most promising group' with multiple working examples (Python variable initialization, JSON object creation, script execution). Section IV-C shows the code-based attack with ASCII encoding is 'one of the most successful' combined approaches.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "GPT-3.5 is more susceptible to complex encoded attacks while Llama-2 provides more information about system prompts but cannot handle complex encryption tasks.",
    375       "evidence": "Section IV-C states 'the GPT-3.5 model is much higher quality than the Llama-2 model' for attack purposes, noting Llama-2 'could not cope with ASCII encoding or other encryption and decryption tasks.' Section IV-A-3 shows Llama-2 outputs its system prompt on direct request while GPT-3.5 does not.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Confidence intervals and significance tests confirmed the effectiveness of the proposed methods.",
    380       "evidence": "Section V states 'Confidence intervals and significance tests were used to assess the robustness of the results. The detailed statistical analyses confirmed the effectiveness' but no actual statistical results, p-values, or confidence intervals are shown anywhere in the paper.",
    381       "supported": "unsupported"
    382     },
    383     {
    384       "claim": "The attack and defense architecture is resource-efficient because it does not require model training or large computing resources.",
    385       "evidence": "Section VI states the approach is 'cheap in terms of resources spent, because it does not involve model training, the presence of a large amount of marked data and large computing resources.' No quantitative cost data is provided.",
    386       "supported": "weak"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Phantom statistical analyses",
    392       "detail": "Section V explicitly claims 'Confidence intervals and significance tests were used to assess the robustness of the results' but no actual statistical results appear anywhere in the paper — no p-values, no confidence intervals, no test statistics. This is a serious credibility concern."
    393     },
    394     {
    395       "flag": "Missing competition context",
    396       "detail": "The paper reports ranking 8th in attack and 9th in defense but never states the total number of participating teams, making it impossible to assess whether these rankings represent strong or weak performance."
    397     },
    398     {
    399       "flag": "No model version specificity",
    400       "detail": "Only 'Chat-GPT 3.5' and 'Llama 2' are specified without versions, sizes, or API dates. Model behavior can vary significantly across versions."
    401     },
    402     {
    403       "flag": "No limitations section",
    404       "detail": "The paper lacks any formal discussion of limitations, threats to validity, or scope boundaries despite making broad claims about LLM security."
    405     },
    406     {
    407       "flag": "Missing author affiliations",
    408       "detail": "Only a personal Gmail address is provided. No institutional affiliations are disclosed for any of the three authors."
    409     },
    410     {
    411       "flag": "Qualitative rather than quantitative attack comparison",
    412       "detail": "Attack categories (distraction, teacher, system prompt, word-splitting, code-based) are described with examples but never quantitatively compared — no success rates per category, no breakdown of which attacks worked against which defenses."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "Jailbreak and Guard Aligned Language Models with Only Few In-Context Demonstrations",
    418       "authors": ["Zeming Wei", "Yifei Wang", "Yisen Wang"],
    419       "year": 2023,
    420       "arxiv_id": "2310.06387",
    421       "relevance": "Introduces contextual attack and defense concepts for LLM jailbreaking, directly relevant to prompt injection security research."
    422     },
    423     {
    424       "title": "Multi-step Jailbreaking Privacy Attacks on ChatGPT",
    425       "authors": ["Haoran Li", "Dadi Guo", "Wei Fan", "Mingshi Xu", "Jie Huang", "Fanpu Meng", "Yangqiu Song"],
    426       "year": 2023,
    427       "arxiv_id": "2304.05197",
    428       "relevance": "Investigates multi-step privacy attacks on ChatGPT, directly relevant to LLM security and prompt injection research."
    429     },
    430     {
    431       "title": "Jailbreaking ChatGPT via Prompt Engineering: An Empirical Study",
    432       "authors": ["Yi Liu", "Gelei Deng", "Zhengzi Xu"],
    433       "year": 2024,
    434       "arxiv_id": "2305.13860",
    435       "relevance": "Empirical study of prompt-based jailbreaking methods, core reference for LLM security evaluation methodology."
    436     },
    437     {
    438       "title": "Defending ChatGPT against Jailbreak Attack via Self-Reminder",
    439       "authors": ["Fangzhao Wu", "Yueqi Xie", "Jingwei Yi"],
    440       "year": 2023,
    441       "relevance": "Proposes self-reminder defense mechanism against jailbreaks, directly relevant to LLM defense strategies."
    442     },
    443     {
    444       "title": "Bergeron: Combating Adversarial Attacks through a Conscience-Based Alignment Framework",
    445       "authors": ["Matthew Pisano", "Peter Ly", "Abraham Sanders"],
    446       "year": 2024,
    447       "arxiv_id": "2312.00029",
    448       "relevance": "Proposes auxiliary LLM defense framework against adversarial attacks, relevant to multi-tier LLM defense architectures."
    449     },
    450     {
    451       "title": "Certifying LLM Safety against Adversarial Prompting",
    452       "authors": ["Aounon Kumar", "Chirag Agarwal", "Suraj Srinivas"],
    453       "year": 2024,
    454       "arxiv_id": "2309.02705",
    455       "relevance": "Proposes certified defenses against adversarial prompts using input permutation, relevant to prompt injection defense research."
    456     },
    457     {
    458       "title": "Low-Resource Languages Jailbreak GPT-4",
    459       "authors": ["Zheng-Xin Yong", "Cristina Menghini", "Stephen H. Bach"],
    460       "year": 2023,
    461       "arxiv_id": "2310.02446",
    462       "relevance": "Demonstrates jailbreaking via translation to low-resource languages, raising safety concerns for multilingual LLM deployment."
    463     },
    464     {
    465       "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    466       "authors": ["Andy Zou", "Zifan Wang", "J. Zico Kolter", "Matt Fredrikson"],
    467       "year": 2023,
    468       "arxiv_id": "2307.15043",
    469       "relevance": "Introduces the AdvBenchmark dataset and universal adversarial attack methods for aligned LLMs, foundational work in LLM adversarial robustness."
    470     },
    471     {
    472       "title": "Jailbroken: How Does LLM Safety Training Fail?",
    473       "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"],
    474       "year": 2023,
    475       "relevance": "Analyzes failure modes of LLM safety training including base64 encoding attacks, directly relevant to understanding prompt injection vulnerabilities."
    476     },
    477     {
    478       "title": "Defending Against Alignment-Breaking Attacks via Robustly Aligned LLM",
    479       "authors": ["Bochuan Cao", "Yuanpu Cao", "Lu Lin", "Jinghui Chen"],
    480       "year": 2023,
    481       "arxiv_id": "2309.14348",
    482       "relevance": "Proposes robust alignment defense against jailbreak attacks, relevant to LLM defense methodology."
    483     },
    484     {
    485       "title": "SmoothLLM: Defending Large Language Models Against Jailbreaking Attacks",
    486       "authors": ["Alexander Robey", "Eric Wong", "Hamed Hassani", "George J. Pappas"],
    487       "year": 2023,
    488       "arxiv_id": "2310.03684",
    489       "relevance": "Proposes input perturbation defense against jailbreaking, relevant to practical LLM defense strategies."
    490     },
    491     {
    492       "title": "Survey of Hallucination in Natural Language Generation",
    493       "authors": ["Ziwei Ji", "Nayeon Lee", "Rita Frieske"],
    494       "year": 2023,
    495       "relevance": "Comprehensive survey of LLM hallucination, relevant to understanding LLM reliability and failure modes."
    496     }
    497   ]
    498 }

Impressum · Datenschutz