ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (29699B)


      1 {
      2   "paper": {
      3     "title": "Cognitive Overload Attack: Prompt Injection for Long Context",
      4     "authors": [
      5       "Bibek Upadhayay",
      6       "Vahid Behzadan",
      7       "Amin Karbasi"
      8     ],
      9     "year": 2024,
     10     "venue": "arXiv preprint",
     11     "arxiv_id": "2410.11272"
     12   },
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL, code archive, or GitHub link is provided anywhere in the paper. The attack algorithm is described in pseudocode (Algorithm 1) but no executable code is released."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The paper uses subsets of the Forbidden Question Set and JailbreakBench (both publicly available), but does not release its curated subset of 232 questions, derivative questions generated by GPT-3.5-Turbo, or cognitive load prompts. No data download link is provided."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No requirements.txt, Dockerfile, or environment specification is provided. The paper mentions using various model APIs but does not specify library versions or environment setup details."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions or README are provided. While the methodology is described in detail (Sections 3-4, Algorithm 1), there are no specific commands or scripts to reproduce the experiments."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No confidence intervals or error bars are reported. The attack success rates (Tables 1, 2) and performance scores (Figure 7) are reported as point estimates without any uncertainty quantification."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Section 3.4.2 reports a paired t-test: 't = 3.1248, p = 0.0048' for the decrease in scores across cognitive load levels. Section 3.4.3 also reports 'p<0.05 for both models' for the increase in irrelevant token counts."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Attack success rates are reported per model (e.g., 99.99% for Claude-3-Opus, 90.95% for GPT-4 in Table 1), and performance degradation is shown relative to the CL0 baseline (Figure 7). The ASR values and score drops convey the magnitude of the effect in the same way that percentage improvements over a baseline do."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No justification is given for the sample sizes used (232 questions from Forbidden Question Set, 100 from JailbreakBench, 100 from Vicuna MT Benchmark, 10 for self-reporting). No power analysis is discussed."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No variance, standard deviation, or spread measures are reported across runs. The paper does not mention running experiments multiple times or reporting any measure of result stability. All results appear to be single-run."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper includes CL0 (no cognitive load) as a baseline condition against which all cognitive load levels are compared. Section 3.4.2 uses pairwise comparison between CL0 and each CL combination. The related work section (Section 6) also acknowledges and compares against Xu et al. (2023), a concurrent work on cognitive overload jailbreaking."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper tests against state-of-the-art models at the time: GPT-4, Claude-3.5-Sonnet, Claude-3-Opus, Llama-3-70B-Instruct, Gemini-1.0-Pro, and Gemini-1.5-Pro. The comparison with Xu et al. (2023) is contemporary."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper progressively adds cognitive load tasks (CL1 through CL6), effectively ablating the contribution of each task component. Section 3.3 (dual-task approach, App. A.2.1) measures the effect of each individual CL task, and the progressive stacking from CL1 to CL6 shows the incremental contribution of each component."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper uses multiple evaluation metrics: Attack Success Rate (ASR) across different datasets (Tables 1, 2), pairwise comparison scores (Figure 7), self-reporting CL scores (Figure 11), token count analysis (Figure 14), and qualitative visual evaluation of code generation output (Figures 2-4)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "The paper explicitly acknowledges the absence of human evaluation in Limitation #8: 'Our work is further limited by the absence of human evaluation to assess responses or derivative questions.' All evaluation is automated through LLM judges."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "There is no clear separation of development and test data. The CL combinations (CL7-CL11) for Claude-3.5-Sonnet were crafted through 'experimental trial and error' (Section 4.3, App. A.8), suggesting the test data was used for tuning the attack. No held-out evaluation is described."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Tables 1 and 2 provide per-model and per-cognitive-load-level breakdowns of attack success rates. Tables 3 and 4 provide additional per-judge-model breakdowns. The Forbidden Question Set categories are described (App. A.6.1), though per-harm-category breakdowns of ASR are not provided."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper discusses the failure of CL1-CL6 against Claude-3.5-Sonnet (Section 4.3), smaller models failing from early CL stages (App. A.4), Gemini models failing to generate code from INTcl onwards (Section 3.4.1), and the lower ASR for Gemini-1.0-Pro (75.43% and 49%). Limitation #7 discusses non-harmful derivative questions."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports several negative results: the initial CL combinations (CL1-CL6) completely failed against Claude-3.5-Sonnet (Section 4.3), the Claude-3.5-Sonnet ASR was only 53% (Table 6), Gemini-1.0-Pro had low ASR of 49% on JailbreakBench (Table 2), and Llama Guard showed effectiveness as a partial defense (App. A.7)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "The abstract claims 'attack success rates of up to 99.99%', which holds for Claude-3-Opus on the Forbidden Question dataset (Table 1) but cherry-picks the best result. The abstract does not mention the much lower 49% ASR for Gemini-1.0-Pro or the 53% for Claude-3.5-Sonnet, so the 'up to' wording, while technically accurate, is misleading about the overall effectiveness."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper makes strong causal claims about cognitive overload causing jailbreaks (e.g., 'cognitive overload—a state where the demand on cognitive processing exceeds the available capacity of the model, leading to potential errors'). However, the analogy between human cognitive load and LLM processing is not empirically validated as causal—the observed performance degradation could have alternative explanations (e.g., attention dilution, context length effects) that do not require invoking cognitive overload as a causal mechanism."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title and framing suggest a general vulnerability in LLMs ('Prompt Injection for Long Context'), but results are from a specific set of 6 models tested with specific CL task combinations. The paper claims broad applicability ('highlighting vulnerabilities in LLMs' ICL') but the attack required completely different prompt designs for Claude-3.5-Sonnet (CL7-CL11), and ASR varied enormously (49-99.99%). These bounds on generalization are not clearly stated in the abstract or conclusion."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper interprets all results through the cognitive load theory lens without seriously considering alternative explanations. The performance degradation with more complex prompts could be explained by simple attention dilution, context window saturation, or instruction-following difficulty rather than 'cognitive overload.' The discussion (Section 5) only discusses competing objectives and mismatched generalization as supporting frameworks, not as alternative explanations to be ruled out."
    130       }
    131     },
    132     "setup_transparency": {
    133       "model_versions_specified": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper lists model names like 'GPT-4', 'Claude-3.5-Sonnet', 'Claude-3-Opus', 'Llama-3-70B-Instruct', 'Gemini-1.0-Pro', and 'Gemini-1.5-Pro' but does not provide specific version identifiers, snapshot dates, or API versions (e.g., 'gpt-4-0613'). Only GPT-3.5-Turbo and GPT-4-Turbo have slightly more specificity but still lack snapshot dates."
    137       },
    138       "prompts_provided": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper provides extensive examples of actual prompts used. Figure 5 shows the observation task obfuscation, Figure 6 shows a complete CL5 prompt, Figure 9 shows the judge evaluation prompt, Figure 16 shows the harmfulness evaluation prompt, and Figures 19, 24, 26-31 in the appendix show complete attack prompts for various CL levels."
    142       },
    143       "hyperparameters_reported": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The paper specifies temperature=0 for the deterministic setting (Sections 2, 3.2) used in experiments. While other hyperparameters like top-p and max tokens are not mentioned, the temperature setting is the most critical parameter and is explicitly stated."
    147       },
    148       "scaffolding_described": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "The attack pipeline is described in detail: Algorithm 1 (App. A.6.1) describes the automated attack algorithm including the iterative CL escalation, judge evaluation loop, and stopping criteria. The derivative question generation process and the multi-judge evaluation pipeline are also documented."
    152       },
    153       "data_preprocessing_documented": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "App. A.6.1 describes the data preprocessing: removing categories 'Political Lobbying', 'Legal Opinion', 'Financial Advice', 'Health Consultation' from the Forbidden Question Set, excluding 'Economic Harm' and 'Pornography' questions deemed harmless, resulting in 232 questions. The derivative question generation process using GPT-3.5-Turbo is also described."
    157       }
    158     },
    159     "limitations_and_scope": {
    160       "limitations_section_present": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section 9 'Limitations' provides an extensive list of 13 specific limitations of the work, spanning nearly two full pages."
    164       },
    165       "threats_to_validity_specific": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "The limitations are specific to this study: Limitation #4 acknowledges the preliminary experiment was limited to a single model (Llama-3-70B-Instruct), #7 discusses non-harmful derivative questions from GPT-3.5-Turbo's safety training, #8 acknowledges the absence of human evaluation, #9 discusses judge LLM bias, #10 discusses the impact of evaluation prompt wording on ASR (SAFE/UNSAFE vs. SAFE/UNSAFE/NEUTRAL)."
    169       },
    170       "scope_boundaries_stated": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The limitations section explicitly states specific scope boundaries: Limitation #1 (limited set of CL tasks and patterns), #2 (CL limited to specific tags), #3 (task order not explored), #6 (self-reporting limited to 10 questions), #11 (only observation task performance assessed, not CL task performance), #12 (cost increases with cognitive load)."
    174       }
    175     },
    176     "data_integrity": {
    177       "raw_data_available": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No raw data is made available. The model responses, judge evaluations, derivative questions, and complete attack prompts are not released for independent verification."
    181       },
    182       "data_collection_described": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The data collection procedure is described: 232 questions from Forbidden Question Set with specific categories removed (App. A.6.1), 100 questions from JailbreakBench, 100 questions from Vicuna MT Benchmark curated for CL experiments (Section 3.4.2), and 100 science experiment questions created using Claude-3.5-Sonnet (App. A.2.1)."
    186       },
    187       "recruitment_methods_described": {
    188         "applies": false,
    189         "answer": false,
    190         "justification": "No human participants were involved in this study. All experiments were conducted with LLM APIs."
    191       },
    192       "data_pipeline_documented": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The pipeline is documented: original harmful questions -> derivative question generation via GPT-3.5-Turbo -> obfuscation with tags -> combination with CL tasks -> attack on target model -> judge evaluation -> classification as SAFE/UNSAFE/NEUTRAL. Algorithm 1 formalizes this. The filtering of Forbidden Question Set categories and counts (232 questions from original set) are specified."
    196       }
    197     },
    198     "conflicts_of_interest": {
    199       "funding_disclosed": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "No funding source or acknowledgments section is present in the paper. There is no mention of any grants, sponsors, or funding agencies."
    203       },
    204       "affiliations_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Author affiliations are clearly listed: Bibek Upadhayay and Vahid Behzadan are affiliated with SAIL Lab, University of New Haven; Amin Karbasi is affiliated with Robust Intelligence, CISCO. The corporate affiliation (CISCO/Robust Intelligence) is disclosed."
    208       },
    209       "funder_independent_of_outcome": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding is disclosed, making independence impossible to assess. One author (Amin Karbasi) is affiliated with Robust Intelligence (CISCO), a company in the AI safety/security space that could benefit from demonstrations of LLM vulnerabilities. This potential conflict is not discussed."
    213       },
    214       "financial_interests_declared": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No competing interests statement or financial disclosure is provided. One author is affiliated with Robust Intelligence/CISCO, which operates in the AI security domain directly relevant to the paper's findings, but no competing interests declaration is present."
    218       }
    219     },
    220     "contamination": {
    221       "training_cutoff_stated": {
    222         "applies": false,
    223         "answer": false,
    224         "justification": "This paper is a red-teaming/jailbreaking study that tests defense bypass rather than evaluating model capability on a benchmark. The attack exploits prompt design, not model knowledge, so training data contamination is not relevant to the claims."
    225       },
    226       "train_test_overlap_discussed": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "Not applicable: this is a security/jailbreaking study that tests defense bypass, not a benchmark evaluation of model knowledge, so train/test overlap is not relevant to the claims. However, the paper does note (Section 4.1) that derivative questions were paraphrased 'to avoid possible contamination of questions during the safety training of the SOTA models,' showing awareness of a related concern."
    230       },
    231       "benchmark_contamination_addressed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Not applicable: the study tests adversarial robustness of safety mechanisms, not model performance on knowledge benchmarks."
    235       }
    236     },
    237     "human_studies": {
    238       "pre_registered": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants were involved in this study."
    242       },
    243       "irb_or_ethics_approval": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants were involved in this study."
    247       },
    248       "demographics_reported": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants were involved in this study."
    252       },
    253       "inclusion_exclusion_criteria": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants were involved in this study."
    257       },
    258       "randomization_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants were involved in this study."
    262       },
    263       "blinding_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants were involved in this study."
    267       },
    268       "attrition_reported": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants were involved in this study."
    272       }
    273     },
    274     "cost_and_practicality": {
    275       "inference_cost_reported": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "The paper acknowledges cost is relevant (Limitation #12: 'As the cognitive load increases, the attack becomes more costly due to the higher number of tokens generated') and mentions API rate limits (App. A.8), but does not quantify actual API costs, tokens consumed per attack, or wall-clock time."
    279       },
    280       "compute_budget_stated": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No total computational budget is stated. The paper does not report total API spend, number of API calls, or total tokens consumed across all experiments despite acknowledging the cost implications."
    284       }
    285     }
    286   },
    287   "claims": [
    288     {
    289       "claim": "Cognitive Load Theory from neuroscience applies to LLMs, and increasing cognitive load degrades LLM performance on secondary tasks similarly to human cognition.",
    290       "evidence": "Section 3.4.2: Paired t-test showing statistically significant decrease in scores from CLi to CLi+1 (t=3.1248, p=0.0048) on Vicuna MT Benchmark with 100 questions across 4 models. Figures 2-4 show visual degradation of code output. Figure 7 shows average score decline.",
    291       "supported": "moderate"
    292     },
    293     {
    294       "claim": "Cognitive overload attacks can jailbreak LLMs with attack success rates up to 99.99%.",
    295       "evidence": "Table 1: Claude-3-Opus achieved 99.99% ASR on Forbidden Question Dataset (232/232 questions). Table 2: Claude-3-Opus achieved 97% ASR on JailbreakBench. However, ASR varies widely: Gemini-1.0-Pro achieved only 75.43% and 49% on the two datasets respectively.",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "The attack is effective across multiple state-of-the-art models including GPT-4, Claude-3.5 Sonnet, Claude-3 OPUS, Llama-3-70B-Instruct, Gemini-1.0-Pro, and Gemini-1.5-Pro.",
    300       "evidence": "Tables 1 and 2 show results across 6 models. However, Claude-3.5-Sonnet required entirely different CL combinations (CL7-CL11) designed through trial and error and achieved only 53% ASR (Table 6). The original CL1-CL6 completely failed against Claude-3.5-Sonnet (Section 4.3).",
    301       "supported": "weak"
    302     },
    303     {
    304       "claim": "Higher-capability models can craft cognitive overload prompts to attack other LLMs, demonstrating transferability.",
    305       "evidence": "Section 4.4 and App. A.9: Claude-3.5-Sonnet was used to create a CL prompt with an encryption algorithm that successfully jailbroke GPT-4 (Figure 23 shows a successful attack). However, this is a single proof-of-concept example, not a systematic evaluation.",
    306       "supported": "weak"
    307     },
    308     {
    309       "claim": "Llama Guard-2 8B guardrail completely failed to identify harmful input prompts during cognitive overload attacks.",
    310       "evidence": "App. A.7: 'All harmful inputs were incorrectly classified as SAFE. Llama Guard failed to identify any harmfulness in our attack input prompt.' For output filtering, Llama Guard achieved up to 45% jailbreak rate (Table 5).",
    311       "supported": "strong"
    312     },
    313     {
    314       "claim": "Irrelevant token generation increases cognitive load in LLMs.",
    315       "evidence": "Section 3.4.3: Statistical paired t-test showing significant increase in CL token counts as CL combination increased (p<0.05 for both GPT-4 and Llama-3 tokenizers). Figure 14 shows token count progression.",
    316       "supported": "moderate"
    317     }
    318   ],
    319   "methodology_tags": [
    320     "benchmark-eval"
    321   ],
    322   "key_findings": "The paper proposes a jailbreaking attack inspired by Cognitive Load Theory from neuroscience, demonstrating that adding irrelevant processing tasks before a hidden harmful question can bypass LLM safety mechanisms. Attack success rates ranged from 49% (Gemini-1.0-Pro on JailbreakBench) to 99.99% (Claude-3-Opus on Forbidden Question Dataset), though the attack required significantly different prompt designs for Claude-3.5-Sonnet and achieved only 53% ASR against it. The paper also shows that Llama Guard-2 8B guardrails completely failed to detect harmful inputs when obfuscated with cognitive load tasks, and that LLM performance on secondary tasks degrades progressively with increasing cognitive load.",
    323   "red_flags": [
    324     {
    325       "flag": "Cherry-picked headline result",
    326       "detail": "The abstract highlights 'up to 99.99%' ASR, which is the single best result (Claude-3-Opus on Forbidden Question Set). ASR ranges from 49% to 99.99% across models and datasets. The 53% ASR on Claude-3.5-Sonnet, requiring entirely different attack prompts, is not mentioned in the abstract."
    327     },
    328     {
    329       "flag": "Anthropomorphic framework without empirical grounding",
    330       "detail": "The core theoretical contribution—that LLMs experience 'cognitive overload' analogous to humans—is an analogy, not a verified mechanism. The paper does not provide evidence that the same cognitive processes are at work. Performance degradation with complex prompts could be explained by simpler mechanisms (attention dilution, context length limitations, instruction-following difficulty) without invoking cognitive load theory."
    331     },
    332     {
    333       "flag": "No human evaluation of attack outputs",
    334       "detail": "All evaluation of whether responses are 'harmful' is done by LLM judges. The paper acknowledges this limitation (#8) and notes judge disagreements vary significantly (#9, #10). The evaluation prompt wording affects ASR (SAFE/UNSAFE vs. SAFE/UNSAFE/NEUTRAL). Tables 3-4 show substantial disagreement among additional judge LLMs."
    335     },
    336     {
    337       "flag": "Attack overfitting for Claude-3.5-Sonnet",
    338       "detail": "The CL1-CL6 attack completely failed against Claude-3.5-Sonnet. New CL7-CL11 combinations were designed through 'experimental trial and error' specifically for this model (App. A.8). This is test-time attack tuning, not a generalizable attack framework."
    339     },
    340     {
    341       "flag": "Single-run results without variance reporting",
    342       "detail": "All ASR results appear to be from single experimental runs. No repeated trials, no standard deviations, and no indication of result stability. API-based experiments should include multiple runs, because hosted models are not guaranteed to produce identical outputs across calls even at temperature=0."
    343     },
    344     {
    345       "flag": "No cost reporting despite high-cost attack",
    346       "detail": "The attack requires generating thousands of tokens per question across multiple CL levels and models, plus judge evaluations. The paper acknowledges cost as a limitation but never quantifies it, making practical assessment impossible."
    347     }
    348   ],
    349   "cited_papers": [
    350     {
    351       "title": "Many-shot jailbreaking",
    352       "authors": ["Cem Anil", "Esin Durmus", "Mrinank Sharma"],
    353       "year": 2024,
    354       "relevance": "Demonstrates jailbreaking by overwhelming models with a large number of in-context demonstrations (many-shot prompting), a mechanism related to cognitive overload."
    355     },
    356     {
    357       "title": "Jailbroken: How does LLM safety training fail?",
    358       "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"],
    359       "year": 2023,
    360       "arxiv_id": "2307.02483",
    361       "relevance": "Foundational analysis of competing objectives and mismatched generalization in LLM safety, directly cited to explain cognitive overload attack mechanism."
    362     },
    363     {
    364       "title": "Do anything now: Characterizing and evaluating in-the-wild jailbreak prompts on large language models",
    365       "authors": ["Xinyue Shen", "Zeyuan Chen", "Michael Backes"],
    366       "year": 2023,
    367       "arxiv_id": "2308.03825",
    368       "relevance": "Source of the Forbidden Question Set benchmark used in the paper's evaluation."
    369     },
    370     {
    371       "title": "Jailbreakbench: An open robustness benchmark for jailbreaking large language models",
    372       "authors": ["Patrick Chao", "Edoardo Debenedetti", "Alexander Robey"],
    373       "year": 2024,
    374       "relevance": "Source of the JailbreakBench dataset (100 harmful questions) used in the paper's evaluation."
    375     },
    376     {
    377       "title": "Prompt injection attack against LLM-integrated applications",
    378       "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li"],
    379       "year": 2023,
    380       "arxiv_id": "2306.05499",
    381       "relevance": "Comprehensive analysis of prompt injection threats against LLM-integrated applications."
    382     },
    383     {
    384       "title": "More than you've asked for: A comprehensive analysis of novel prompt injection threats to application-integrated large language models",
    385       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra"],
    386       "year": 2023,
    387       "arxiv_id": "2302.12173",
    388       "relevance": "Foundational work on indirect prompt injection threats to LLM-integrated systems."
    389     },
    390     {
    391       "title": "Cognitive Overload: Jailbreaking Large Language Models with Overloaded Logical Thinking",
    392       "authors": ["Nan Xu", "Fei Wang", "Ben Zhou"],
    393       "year": 2023,
    394       "arxiv_id": "2311.09827",
    395       "relevance": "Prior work on cognitive overload jailbreaking with three variants (multilingual, veiled expressions, effect-to-cause reasoning) that this paper extends."
    396     },
    397     {
    398       "title": "Jailbreaking black box large language models in twenty queries",
    399       "authors": ["Patrick Chao", "Alexander Robey", "Edgar Dobriban"],
    400       "year": 2023,
    401       "arxiv_id": "2310.08419",
    402       "relevance": "Automated black-box jailbreaking attack demonstrating that adversarial prompts can be generated with minimal human intervention."
    403     },
    404     {
    405       "title": "Universal and transferable adversarial attacks on aligned language models",
    406       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini"],
    407       "year": 2023,
    408       "arxiv_id": "2307.15043",
    409       "relevance": "Influential gradient-based adversarial attack on aligned language models demonstrating universal transferability."
    410     },
    411     {
    412       "title": "Llama Guard: LLM-based input-output safeguard for human-AI conversations",
    413       "authors": ["Hakan Inan", "Kartikeya Upasani", "Jianfeng Chi"],
    414       "year": 2023,
    415       "arxiv_id": "2312.06674",
    416       "relevance": "The guardrail system evaluated in this paper's attack experiments, showing vulnerability to cognitive overload prompts."
    417     },
    418     {
    419       "title": "Harmbench: A standardized evaluation framework for automated red teaming and robust refusal",
    420       "authors": ["Mantas Mazeika", "Long Phan", "Xuwang Yin"],
    421       "year": 2024,
    422       "arxiv_id": "2402.04249",
    423       "relevance": "Standardized evaluation framework for red teaming LLMs, one of the sources for JailbreakBench questions."
    424     },
    425     {
    426       "title": "Tree of attacks: Jailbreaking black-box LLMs automatically",
    427       "authors": ["Anay Mehrotra", "Manolis Zampetakis", "Paul Kassianik"],
    428       "year": 2023,
    429       "arxiv_id": "2312.02119",
    430       "relevance": "Automated jailbreaking framework using tree-based search, relevant to automated adversarial attack methods on LLMs."
    431     },
    432     {
    433       "title": "Sandwich attack: Multi-language mixture adaptive attack on LLMs",
    434       "authors": ["Bibek Upadhayay", "Vahid Behzadan"],
    435       "year": 2024,
    436       "arxiv_id": "2404.07242",
    437       "relevance": "Related work by the same authors demonstrating language-switching attacks on LLMs, which informs the cognitive load framework."
    438     }
    439   ]
    440 }

Impressum · Datenschutz