ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (30590B)


      1 {
      2   "paper": {
      3     "title": "Jatmo: Prompt Injection Defense by Task-Specific Finetuning",
      4     "authors": [
      5       "Julien Piet",
      6       "Maha Alrashed",
      7       "Chawin Sitawarin",
      8       "Sizhe Chen",
      9       "Zeming Wei",
     10       "Elizabeth Sun",
     11       "Basel Alomair",
     12       "David Wagner"
     13     ],
     14     "year": 2023,
     15     "venue": "European Symposium on Research in Computer Security",
     16     "arxiv_id": "2312.17673",
     17     "doi": "10.48550/arXiv.2312.17673"
     18   },
     19   "scan_version": 3,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval"],
     22   "key_findings": "Jatmo fine-tunes non-instruction-tuned base models on task-specific datasets to defend against prompt injection attacks. Across 7 tasks, Jatmo models match GPT-3.5-Turbo quality (within 2%) while reducing prompt injection attack success from 87% to under 0.5%. Synthetic dataset generation with as few as one real example produces models achieving 96% of teacher model quality. The defense exploits the fact that base models have never been instruction-tuned and thus do not follow injected instructions.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Code released at https://github.com/wagner-group/prompt-injection-defense, referenced in both the abstract and footnote 4."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "All evaluation datasets are publicly available standard benchmarks: The Stack, IMDB, Amazon Reviews, Gutenberg, CNN/DM, Jigsaw, and STS (Table 1). The paper uses these unmodified."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided in the paper. The paper mentions using OpenAI APIs but does not document the full software environment."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "While code is released and the methodology is described, the paper does not include step-by-step reproduction instructions or a README-style guide for replicating experiments."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Table 2 and all figures report point estimates only. No confidence intervals or error bars are provided for any results."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No statistical significance tests are used. Comparisons between Jatmo and GPT-3.5-Turbo are made by directly comparing percentages without any formal testing."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Results are reported with baseline context enabling effect size interpretation: e.g., attack success drops from 87% (GPT-3.5-Turbo) to 0.5% (Jatmo), and quality is reported relative to the teacher model (e.g., '2% lower', 'Same')."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No justification is given for why 400 training examples were chosen beyond empirical convergence shown in Fig. 4. No power analysis or principled sample size determination is discussed."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. All results appear to be single-run numbers."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "GPT-3.5-Turbo serves as the primary baseline for both quality and security comparisons throughout the paper (Table 2)."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "GPT-3.5-Turbo was a current, competitive model at the time of publication (late 2023). The paper also uses GPT-4 for synthetic data generation."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Fig. 4 shows quality at different training set sizes (10, 50, 100, 200, 400). Section 5.4 compares zero-shot vs one-shot synthetic dataset generation. Fig. 5 compares temperature settings (T=0.7 vs T=1.0). These collectively show which components and settings matter."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Two distinct evaluation dimensions are used: quality metrics (accuracy for classification tasks, GPT-3.5 rating for generative tasks) and security metrics (prompt injection attack success rate)."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No human evaluation of model outputs is performed. Quality assessment relies entirely on automated metrics: ground-truth accuracy for classification and GPT-3.5-Turbo ratings for generative tasks. The authors manually inspect one task's results for attack analysis but do not systematically evaluate output quality with humans."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 4 states 'we reserve part of the dataset for quality and prompt injection evaluations.' Section 5.4 explicitly separates 800 training, 100 evaluation, and 100 test examples for synthetic experiments."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Table 2 provides per-task breakdowns across all 7 tasks for both quality and attack success rates. Attack results are further broken down by injection position (start, middle, end)."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper discusses the 2% attack success on review summarization (the longest injections succeeding) and manually inspects news summarization, finding 'four partially successful attacks among the total 3900 tested injections.' Zero-shot synthetic data struggles are also discussed."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The paper reports that zero-shot synthetic datasets underperform one-shot for news summarization, that temperature T=1.0 produces unstable results for translation, and that some tasks require more training data than others."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The abstract claims '<0.5% attack success vs 87% for GPT-3.5-Turbo' and 'similar quality' — both are directly supported by Table 2 showing 0-2% attack success rates and quality within 2% of GPT-3.5-Turbo."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper claims the defense works 'because F is fine-tuned from a non-instruction-tuned LLM, it has never been trained to search for and follow instructions.' This causal mechanism is not isolated — the paper does not test fine-tuning an instruction-tuned model to verify that instruction tuning is the causal factor. Confounds include model size, architecture, and capability differences between davinci-002 and GPT-3.5-Turbo."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title claims a general 'Prompt Injection Defense' and the paper calls Jatmo 'the first effective defense against prompt injection attacks,' but experiments use only one base model (davinci-002), 7 tasks, and 13 attack templates. The generalization to other models, tasks, and more sophisticated attacks is not bounded."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper does not consider alternative explanations for why the defense works. For instance, it does not discuss whether the defense might simply be due to the base model being less capable, whether fine-tuning on a narrow distribution creates a form of out-of-distribution rejection, or whether the same effect could be achieved with a simpler approach."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper explicitly acknowledges the proxy gap in its security metric: 'Our prompt-injection metric has a tendency to underestimate attack effectiveness: outputs containing anything else than HACKED are marked as unsuccessful.' The authors also justify using LLM ratings as a proxy for output quality."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "The base model 'davinci-002' is a specific model identifier, but 'GPT-3.5-Turbo' and 'GPT-4' are used without version snapshots or API version dates. Model behavior changes across versions of these APIs."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Appendix A.1 provides the exact task prompts and best injection prompts for all 7 tasks. Appendix A.2 provides the full synthetic dataset generation prompts including system prompt, seed generation, and formatting prompts."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "Inference temperature is discussed (T=0.7 vs T=1.0) but fine-tuning hyperparameters (learning rate, epochs, batch size, etc.) are not reported. The OpenAI fine-tuning API settings are not specified."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "No agentic scaffolding is used. Jatmo is a fine-tuning pipeline, not an agentic system."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 4.1 and Fig. 3 document the full synthetic data pipeline: seed generation, input generation, input formatting. Section 5.1 describes how HackAPrompt attacks were filtered ('kept the most generic ones, level 1, ranked by success rate, kept top 10') and how test cases were constructed (13 attacks × 3 positions = 39 variants per test sample)."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 6 'Discussion' contains a 'Limitations' subsection discussing single-task constraint, scope limited to prompt injection (not jailbreak), and acknowledging potential for more sophisticated attacks."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The limitations section identifies specific threats: 'Single-task models sacrifice versatility,' 'Jatmo only defends against prompt-injection attacks and is not designed to prevent jailbreak attacks,' and 'it is possible that there might be more sophisticated attacks we didn't think of, and we welcome further security evaluation.'"
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 1 explicitly states: 'We focus on defending against prompt injection attacks on LLM-integrated applications. Defending against prompt injection in web chat is beyond the scope of this paper.' Section 6 adds: 'Jatmo only defends against prompt-injection attacks and is not designed to prevent jailbreak attacks on alignment or adversarial examples.'"
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "While input datasets are public benchmarks, the generated fine-tuning outputs from GPT-3.5, the fine-tuned models themselves, and raw experimental results (per-sample attack outcomes) are not explicitly made available for independent verification."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 4 describes the three-stage pipeline: dataset collection from standard benchmarks, output generation using the teacher model, and fine-tuning. Section 4.1 details synthetic dataset generation. Section 5.1 details test set construction."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No human participants. All data comes from standard public benchmarks."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "The full pipeline is documented: input collection (from standard datasets or synthetic generation) → output generation via teacher model (Ri = M(P+Di)) → train/eval/test split → fine-tuning → evaluation with injected test sets (39 variants per sample). Section 5.4 specifies 1000 synthetic inputs split into 800 training, 100 evaluation, 100 testing."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Acknowledgements section lists: KACST-UCB Joint Center on Cybersecurity, OpenAI, NSF (grants 2229876 and CNS-2154873), DHS, IBM, C3.ai Digital Transformation Institute, Open Philanthropy, and Google."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations are clearly listed: UC Berkeley, King Abdulaziz City for Science and Technology, and Peking University."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "OpenAI is listed as a funder, and the entire evaluation uses OpenAI models (GPT-3.5-Turbo as teacher, GPT-4 for synthetic generation, davinci-002 as base). OpenAI has a financial interest in demonstrating that their models can be used securely."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial disclosure statement is present in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "The paper tests a defense mechanism against prompt injection, not model knowledge on benchmarks. Quality evaluation is secondary and uses the teacher model's own labels, making contamination less relevant to the claims."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "The paper tests defenses rather than model knowledge. The fine-tuned models are trained on task-specific data generated by the teacher model, not evaluated on pre-training knowledge."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "The paper tests defenses rather than model knowledge. Contamination of benchmarks in pre-training data is not relevant to the prompt injection defense claims."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in this study."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "The paper claims 'no extra runtime overhead' and suggests cost savings from smaller models, but does not quantify actual inference costs, API costs, or latency for any experiments."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "No GPU hours, fine-tuning time, API costs, or total computational budget is reported despite fine-tuning multiple models across 7 tasks."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No results across multiple random seeds are reported. All results appear to be from single experimental runs."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The number of experimental runs is not stated. It is unclear whether results are from single or multiple runs."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No hyperparameter search budget is reported. Fine-tuning hyperparameters are not even listed, let alone search procedures."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": true,
    320         "justification": "The choice of 400 training examples is justified by the convergence analysis in Fig. 4 showing quality plateaus. Temperature selection (T=0.7) is justified by comparison with T=1.0 in Fig. 5."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "Section 5.1 explicitly addresses evaluation bias: 'we fine-tune Jatmo models on GPT-3.5-Turbo-generated labels instead of the ground truth' to avoid unfairly inflating Jatmo's apparent quality. They note 'This would unfairly inflate the apparent quality of our task-specific models.'"
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "Performance is shown as a function of training set size (Fig. 4) but not as a function of compute budget. The compute cost of fine-tuning Jatmo models vs. prompting GPT-3.5-Turbo is not compared."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": true,
    340         "justification": "The paper discusses the validity of its security metric: 'Our prompt-injection metric has a tendency to underestimate attack effectiveness: outputs containing anything else than HACKED are marked as unsuccessful.' The authors also manually inspect outputs to validate this concern."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": false,
    344         "answer": false,
    345         "justification": "No scaffolding is involved. Jatmo is a direct model fine-tuning approach without agentic components."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "The paper does not discuss whether davinci-002's pre-training data included examples from the evaluation datasets (IMDB, CNN/DM, etc.), which were all published well before the model's training cutoff."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether the evaluation setup leaks information. For example, the format of injected prompts might provide distributional cues that help the fine-tuned model reject them for reasons unrelated to instruction-following immunity."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No discussion of whether training and test data share structural similarities beyond the explicit train/test split. The fine-tuning data and test data come from the same datasets."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No concrete leakage detection or prevention method is applied beyond the basic train/test split."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Jatmo models provide similar quality outputs as GPT-3.5-Turbo while being resilient to prompt injection attacks",
    374       "evidence": "Table 2 shows all Jatmo models within 2% of GPT-3.5-Turbo quality. Attack success rate drops from 87% average against GPT-3.5-Turbo to under 0.5% against Jatmo models (only 2 successful injections out of 23,400 tested).",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Only two prompt-injected inputs out of 23,400 succeeded against a Jatmo model",
    379       "evidence": "Table 2 and Section 5.2 report that the only successful attacks occurred in 2% of cases against review summarization with end-position injection. Manual inspection of news summarization found only 4 partially successful attacks in 3,900 tests.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Jatmo achieves 96% of teacher model quality using one real example and 800 synthetic examples",
    384       "evidence": "Fig. 5 shows one-shot synthetic models achieving scores within 4% of GPT-3.5-Turbo at T=0.7 for tested tasks (Section 5.4).",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "A single real example is sufficient for synthetic dataset generation to match real data quality",
    389       "evidence": "Section 5.4 and Fig. 5 compare zero-shot vs one-shot synthetic models. One-shot matches GPT-3.5 quality for review summarization and approaches it for news summarization, while zero-shot struggles on tasks with specific formatting (Section 5.4).",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Jatmo is the first effective defense against prompt injection attacks",
    394       "evidence": "Section 1 states 'we present what is (as far as we are aware) the first effective defense against prompt injection attacks.' The related work section (Section 2) surveys existing approaches and finds no effective prior defenses.",
    395       "supported": "weak"
    396     },
    397     {
    398       "claim": "The defense works because base models have not undergone instruction tuning and thus cannot follow injected instructions",
    399       "evidence": "The theoretical argument is presented in Sections 1 and 4, but the causal mechanism is not experimentally isolated. No experiment tests whether fine-tuning an instruction-tuned model would also resist injection, which would be needed to confirm that instruction-tuning absence is the causal factor.",
    400       "supported": "weak"
    401     }
    402   ],
    403   "red_flags": [
    404     {
    405       "flag": "OpenAI funding while evaluating OpenAI models",
    406       "detail": "OpenAI is listed as a funder in acknowledgements, and the entire evaluation relies on OpenAI models (GPT-3.5-Turbo, GPT-4, davinci-002). This creates a conflict of interest where the funder benefits from showing their models can be used securely."
    407     },
    408     {
    409       "flag": "No statistical significance testing",
    410       "detail": "All comparisons between Jatmo and GPT-3.5-Turbo are made by directly comparing percentages without any formal statistical tests, confidence intervals, or multiple-run variance estimates."
    411     },
    412     {
    413       "flag": "Limited attack diversity",
    414       "detail": "Only 13 attack templates (10 from HackAPrompt level 1 + 3 hand-crafted) are used. These are relatively simple template-based attacks. More sophisticated adaptive attacks (e.g., gradient-based, model-aware) are not tested."
    415     },
    416     {
    417       "flag": "Single base model tested",
    418       "detail": "Only davinci-002 is used as the base model. The claim that non-instruction-tuned models are immune to prompt injection is not tested on other base models (e.g., Llama base, Falcon base), limiting the generalizability of the defense."
    419     },
    420     {
    421       "flag": "Causal mechanism not isolated",
    422       "detail": "The paper attributes the defense to the base model's lack of instruction tuning, but does not test the obvious control: fine-tuning an instruction-tuned model with the same process. Without this comparison, it's unclear whether the defense comes from the base model property or from task-specific fine-tuning itself."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    428       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    429       "year": 2023,
    430       "arxiv_id": "2302.12173",
    431       "relevance": "Foundational work on indirect prompt injection attacks against LLM-integrated applications, directly motivating Jatmo's defense approach."
    432     },
    433     {
    434       "title": "Ignore previous prompt: Attack techniques for language models",
    435       "authors": ["Fábio Perez", "Ian Ribeiro"],
    436       "year": 2022,
    437       "relevance": "Early work categorizing prompt injection attacks (goal hijacking and prompt leaking), providing the attack taxonomy used in this paper."
    438     },
    439     {
    440       "title": "Ignore This Title and HackAPrompt: Exposing Systemic Vulnerabilities of LLMs through a Global Scale Prompt Hacking Competition",
    441       "authors": ["Sander Schulhoff"],
    442       "year": 2023,
    443       "arxiv_id": "2311.16119",
    444       "relevance": "Source of the HackAPrompt dataset used as one of the two attack sets for evaluating Jatmo's security."
    445     },
    446     {
    447       "title": "Tensor Trust: Interpretable Prompt Injection Attacks from an Online Game",
    448       "authors": ["Sam Toyer"],
    449       "year": 2023,
    450       "arxiv_id": "2311.01011",
    451       "relevance": "Gamified approach to collecting prompt injection attacks and defenses, relevant to evaluation methodology."
    452     },
    453     {
    454       "title": "SmoothLLM: Defending Large Language Models Against Jailbreaking Attacks",
    455       "authors": ["Alexander Robey", "Eric Wong", "Hamed Hassani", "George J. Pappas"],
    456       "year": 2023,
    457       "arxiv_id": "2310.03684",
    458       "relevance": "Alternative defense using randomized smoothing, providing contrast to Jatmo's fine-tuning approach."
    459     },
    460     {
    461       "title": "Prompt Injection Attacks and Defenses in LLM-Integrated Applications",
    462       "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
    463       "year": 2023,
    464       "arxiv_id": "2310.12815",
    465       "relevance": "Comprehensive survey of prompt injection attacks and defenses, providing context for the threat landscape Jatmo addresses."
    466     },
    467     {
    468       "title": "Prompt Injection Attack against LLM-integrated Applications",
    469       "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li", "Kailong Wang"],
    470       "year": 2023,
    471       "arxiv_id": "2306.05499",
    472       "relevance": "Demonstrates prompt injection attacks against real-world LLM-integrated applications."
    473     },
    474     {
    475       "title": "Baseline Defenses for Adversarial Attacks Against Aligned Language Models",
    476       "authors": ["Neel Jain", "Avi Schwarzschild", "Yuxin Wen"],
    477       "year": 2023,
    478       "arxiv_id": "2309.00614",
    479       "relevance": "Proposes baseline defenses including paraphrasing and perplexity detection, which Jatmo aims to improve upon."
    480     },
    481     {
    482       "title": "Jailbroken: How Does LLM Safety Training Fail?",
    483       "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"],
    484       "year": 2023,
    485       "arxiv_id": "2307.02483",
    486       "relevance": "Analyzes failure modes of LLM safety training including string obfuscation techniques that could bypass input sanitization defenses."
    487     },
    488     {
    489       "title": "Training language models to follow instructions with human feedback",
    490       "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"],
    491       "year": 2022,
    492       "arxiv_id": "2203.02155",
    493       "relevance": "InstructGPT paper describing the instruction tuning process that Jatmo deliberately avoids by fine-tuning non-instruction-tuned base models."
    494     },
    495     {
    496       "title": "Certifying LLM Safety against Adversarial Prompting",
    497       "authors": ["Aounon Kumar", "Chirag Agarwal", "Suraj Srinivas", "Soheil Feizi", "Himabindu Lakkaraju"],
    498       "year": 2023,
    499       "arxiv_id": "2309.02705",
    500       "relevance": "Proposes certified defenses against adversarial prompts using LLM-based detection, complementary to Jatmo's fine-tuning approach."
    501     }
    502   ],
    503   "engagement_factors": {
    504     "practical_relevance": {
    505       "score": 3,
    506       "justification": "Released open-source tool with code that serves as a drop-in replacement for LLM API calls in applications, directly applicable to production systems."
    507     },
    508     "surprise_contrarian": {
    509       "score": 1,
    510       "justification": "The approach is intuitive — using non-instruction-tuned models to avoid instruction-following vulnerabilities — rather than counterintuitive."
    511     },
    512     "fear_safety": {
    513       "score": 2,
    514       "justification": "Demonstrates that 87% of prompt injection attacks succeed against GPT-3.5-Turbo, highlighting a major security concern for deployed LLM applications."
    515     },
    516     "drama_conflict": {
    517       "score": 0,
    518       "justification": "No controversy or conflict angle; straightforward defense paper."
    519     },
    520     "demo_ability": {
    521       "score": 2,
    522       "justification": "Code released on GitHub but requires OpenAI API access and fine-tuning setup, not a simple pip-install-and-run experience."
    523     },
    524     "brand_recognition": {
    525       "score": 1,
    526       "justification": "UC Berkeley is well-known in AI research but not a major industry AI lab. The paper's use of OpenAI models adds some brand recognition."
    527     }
    528   }
    529 }

Impressum · Datenschutz