ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (31604B)


      1 {
      2   "paper": {
      3     "title": "MELON: Provable Defense Against Indirect Prompt Injection Attacks in AI Agents",
      4     "authors": [
      5       "Kaijie Zhu",
      6       "Xianjun Yang",
      7       "Jindong Wang",
      8       "Wenbo Guo",
      9       "William Yang Wang"
     10     ],
     11     "year": 2025,
     12     "venue": "International Conference on Machine Learning",
     13     "arxiv_id": "2502.05174"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "MELON detects indirect prompt injection attacks by comparing agent tool calls between an original execution and a masked re-execution where the user prompt is replaced with a task-neutral prompt. The method is evaluated on the AgentDojo benchmark with GPT-4o, o3-mini, and Llama-3.3-70B; on GPT-4o, MELON-Aug achieves a 0.32% attack success rate while maintaining 68.72% utility under attack, outperforming five baseline defenses. Ablation studies confirm each design component (customized masking function, tool call cache, focused tool call comparison) contributes to performance, and the method is insensitive to hyperparameter choices.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract states 'Code is available at https://github.com/kaijiezhu11/MELON' and provides a working URL."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The evaluation uses the publicly available AgentDojo benchmark (Debenedetti et al., 2024), which is an open framework. No proprietary data was collected."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No requirements.txt, Dockerfile, conda environment file, or detailed dependency specifications are mentioned in the paper."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but the paper itself contains no commands or reproduction guide."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "All results in Tables 1-4 and Figures 1, 3 report only point estimates (e.g., '0.32% ASR', '68.72% UA') with no confidence intervals, error bars, or uncertainty measures."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Claims like 'MELON outperforms SOTA defenses' and 'significantly outperforms five SOTA defenses' are based solely on comparing point estimates in Table 1. No statistical significance tests (p-values, t-tests, etc.) are reported."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Absolute performance numbers are reported for all methods with baseline context. For example, 'MELON-Aug achieves 68.72% UA with 0.32% ASR, compared to the no defense baseline (69.08% UA, 16.06% ASR)' (Section 4.2). Tables provide full breakdowns enabling comparison."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The evaluation uses 629 attack cases and 97 benign tasks from AgentDojo, but no justification is given for why this sample size is sufficient for the claims being made, nor is a power analysis discussed."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Temperature is set to 0 for all models to 'avoid randomness' (Section 4.1), but the paper reports single-run numbers with no variance, standard deviation, or spread measures across multiple runs."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Five baseline defenses are compared: DeBERTa Detector, LLM Detector, Delimiting, Repeat Prompt, and Tool Filter (Section 4.1, Table 1)."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Baselines include recent works: AgentDojo tool filter (2024), ProtectAI DeBERTa detector (2024), spotlighting/delimiting (Hines et al., 2024), and repeat prompt (2023). These represent the current state of the art."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Section 4.3 conducts a systematic ablation study removing each of the three key designs: the masking function (Basic), tool call cache (No Cache), and focused comparison (Full Comp.). Results in Table 2 show each component's contribution."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Three evaluation metrics are used: Utility under Attack (UA), Attack Success Rate (ASR), and Benign Utility (BU), as defined in Section 4.1."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "Evaluation is entirely automated through the AgentDojo benchmark. No human evaluation of the defense's outputs or decisions is conducted."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "All evaluations, including hyperparameter sensitivity analysis for θ (Table 4) and masking prompt variations (Table 3), are conducted on the same AgentDojo dataset. No separate validation set is used for tuning vs. testing."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Table 1 provides breakdowns by attack type (Direct, Ignore Previous, System Message, Important Messages), by model (GPT-4o, o3-mini, Llama-3.3-70B), and by agent type for false positive analysis (banking, slack, travel, workspace in Appendix E)."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 4.5 analyzes 66 attack success cases identifying four failure patterns: Response-Based Attacks (72.73%), Tool Call Redundancy (15.15%), State Hallucination (6.06%), and Function Hallucination (6.06%)."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 4.4 reports MELON's 9.28% false positive rate. Section 4.2 acknowledges BU decreases for MELON in some cases. Table 2 shows the basic masking approach (without their improvements) has higher ASR. Section 4.5 reports 66 failure cases."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims MELON 'outperforms SOTA defenses in both attack prevention and utility preservation' which is supported by Table 1 and Figure 1 showing lowest average ASR (0.24% for MELON, 0.32% for MELON-Aug on GPT-4o) with competitive UA."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Causal claims about component contributions ('each component is essential for effective detection') are justified through the ablation study in Table 2, which uses controlled single-variable removal of each design component."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title claims 'Provable Defense Against Indirect Prompt Injection Attacks in AI Agents' — an unbounded generalization. Results are limited to one benchmark (AgentDojo), 3 models, and 4 general attack types. The paper does not bound claims to these settings. The theoretical 'provable' guarantee (Section 3.4) relies on assumptions (µB < θ < µV) not empirically verified."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper does not consider alternative explanations for MELON's success, such as whether AgentDojo's attack patterns are inherently amenable to masking-based detection, or whether the results would hold under more sophisticated adaptive attacks beyond the four general attack types tested."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The metrics (UA, ASR, BU) directly measure what is claimed: attack prevention (ASR), utility maintenance under attack (UA), and normal utility (BU). No proxy gap exists between measurements and claims."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper uses 'GPT-4o', 'o3-mini', and 'Llama-3.3-70B' without specifying API snapshot dates, version identifiers, or model release dates. Per the schema, marketing names like 'GPT-4o' without a snapshot date do not count."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix A.1 provides the complete task-neutral prompt Tf. Appendix A.2 provides full few-shot examples. Appendix C shows complete attack prompts and defense prompts. The LLM Detector prompt is given in Appendix C.2.2."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Temperature set to 0 for all models (Section 4.1). Similarity threshold θ=0.8 (Section 3.3.1). OpenAI text-embedding-v3 model specified for embeddings. Sensitivity tests on θ from 0.5 to 0.9 (Table 4)."
    158       },
    159       "scaffolding_described": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 3.1 formally defines the agent architecture (states, actions, tool calls). Sections 3.2 and 3.3 detail the MELON detection pipeline, with Algorithm 1 providing pseudocode. Figure 2 shows the detection pipeline architecture."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 4.1 documents AgentDojo's structure: 4 agent types with 16, 21, 20, 40 user tasks respectively, 629 attack cases formed by pairing user tasks with attack tasks. The tool call transformation for comparison is detailed in Appendix A.3."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 4.5 ('Analysis of Attack Success Cases') provides substantive discussion of where MELON fails, identifying four specific failure patterns with percentages. Section 5 discusses future work addressing current limitations."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 4.5 identifies four specific failure patterns: Response-Based Attacks (72.73% of failures), Tool Call Redundancy (15.15%), State Hallucination (6.06%), Function Hallucination (6.06%). These are specific to this defense mechanism."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The paper does not explicitly state what the results do not show. It doesn't acknowledge that results are limited to AgentDojo's specific task domains, that only four general attack types were tested, or that adaptive attacks designed specifically against MELON were not evaluated. The 'Provable' claim in the title suggests broader applicability than demonstrated."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "Only aggregate metrics (ASR, UA, BU percentages) are reported in tables. Individual per-task or per-case results are not available for independent verification."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 4.1 describes the AgentDojo benchmark composition: 97 user tasks across 4 agent types, 629 attack cases, 4 attack methods, and 3 LLMs. The benchmark's construction is cited from Debenedetti et al. (2024)."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. The evaluation uses a standard benchmark (AgentDojo)."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The evaluation pipeline is clearly documented: AgentDojo provides user tasks and attack tasks → attacks inject malicious content into tool outputs → defense methods are applied → metrics (UA, ASR, BU) are computed per AgentDojo's definitions. Algorithm 1 details the detection procedure."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Acknowledgements section states: 'This research was funded in part by ARL Grant W911NF-23-2-0137 and the Microsoft Accelerating Foundation Models Research (AFMR) grant program. We thank FAR AI, OpenAI, and Berkeley RDI for their support.'"
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are listed: University of California, Santa Barbara (Zhu, Yang, Guo, W. Wang) and William & Mary (J. Wang)."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "The paper thanks OpenAI for support and heavily evaluates OpenAI models (GPT-4o, o3-mini) and uses OpenAI's embedding API. Microsoft (funder) is OpenAI's primary investor. Both have financial interest in demonstrating that LLM agents can be effectively defended against attacks."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests statement or financial interests declaration is present in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "This paper evaluates a defense mechanism (MELON) rather than model knowledge. The benchmark (AgentDojo) tests defense effectiveness, not model capability. Per schema guidance, studies that test defenses/tools rather than model knowledge are NA."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "This paper tests a defense method, not model knowledge on a benchmark. The primary evaluation compares defense strategies while holding the model constant."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "This paper tests a defense method, not model knowledge on a benchmark. Contamination of model training data does not affect the relative comparison of defense methods."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study. Evaluation is entirely automated on the AgentDojo benchmark."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Section 3.3.1 states 'the introduction of the masking run effectively doubles the number of required model calls. This results in a ≈2× increase in API costs compared to the undefended baseline system.' The latency overhead from embedding computation is also discussed."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No total computational budget is stated. The paper does not report total API spend, number of tokens consumed, GPU hours, or total cost for running the experiments across 3 models × 8 defenses × 629 attack cases."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "Temperature is set to 0 for all models to avoid randomness (Section 4.1), but no seed sensitivity analysis is performed. API-based models may still exhibit non-determinism even at temperature 0, and this is not explored."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The paper does not state how many experimental runs produced the reported results. Setting temperature to 0 implies single runs, but this is not explicitly stated."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The primary threshold θ=0.8 is used without stating how it was selected. The sensitivity analysis in Table 4 covers 5 values (0.5-0.9), but this is a post-hoc analysis on the test data, not a proper search budget report."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The threshold θ=0.8 is stated as the primary value to 'balance detection sensitivity and false positive rate' (Section 3.3.1), but the selection process is not described. The sensitivity analysis in Table 4 is performed on the same test data used for final evaluation."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "Multiple comparisons are made across 4 attack types × 3 models × 8 defense methods without any correction for multiple comparisons (Bonferroni, etc.)."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors implement and evaluate their own system against baselines they also implemented (e.g., LLM Detector, DeBERTa Detector). There is no acknowledgment of self-comparison bias per Lucic et al. (2018)."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "MELON uses approximately 2x compute vs baselines, but performance is not plotted or analyzed as a function of compute budget. Defenses like tool filter or prompt augmentation use much less compute but are compared directly without normalizing for compute."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper chooses AgentDojo because it is 'the latest one containing many diverse attack cases' (Section 4.1) but does not discuss whether AgentDojo's simulated environments adequately represent real-world IPI attack scenarios."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "The defense mechanism IS the scaffold being tested. MELON modifies the agent's execution pipeline, and the comparison is between different defense scaffolds applied to the same base agent. The scaffold is the independent variable."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether AgentDojo tasks or their solutions existed before the models' training cutoffs. If models were trained on similar task patterns, evaluation results may be inflated."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the AgentDojo evaluation setup leaks information to the models. The paper does not analyze whether task descriptions or tool schemas provide hints about expected solutions."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "The 629 attack cases share user tasks across attack types (e.g., the same user task paired with different attacks). Non-independence of these test cases is not discussed."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No concrete leakage detection or prevention method is applied. No canary strings, membership inference, or decontamination is discussed."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "MELON and MELON-Aug achieve the lowest attack success rate while maintaining high utility, outperforming all baseline defense methods.",
    370       "evidence": "Table 1 shows MELON achieves 0.24% average ASR on GPT-4o with 58.78% UA, and MELON-Aug achieves 0.32% ASR with 68.72% UA, compared to the best baseline (Tool Filter at 2.34% ASR, 65.54% UA). Figure 1 and Figure 3 visualize the trade-off.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "MELON-Aug reduces ASR to 0.32% while maintaining 68.72% utility on GPT-4o.",
    375       "evidence": "Table 1, GPT-4o row for MELON-Aug shows averaged UA=68.72% and ASR=0.32%. This compares favorably to no-defense baseline of 69.08% UA and 16.06% ASR.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Each of the three key design components (customized masking function, tool call cache, focused tool call comparison) is essential for effective detection.",
    380       "evidence": "Table 2 ablation study shows removing any component increases ASR: Basic (no Tf) 2.70% vs 0.95%, No Cache 1.75% vs 0.95%, Full Comp. 17.33% vs 0.95%. All variants perform worse than the full MELON.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "MELON's performance is insensitive to key hyperparameters (task-neutral prompt formulation and similarity threshold).",
    385       "evidence": "Table 3 shows ASR ranges from 0.95% to 1.43% across 5 prompt variants. Table 4 shows ASR is 0.95% for thresholds 0.5, 0.7, 0.8, and 0.9, and 1.11% for 0.6. Only tested on GPT-4o with Important Messages attack.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Error rates of the ensemble detector decrease exponentially with the number of weak detectors.",
    390       "evidence": "Section 3.4 provides a theoretical proof using Hoeffding's inequality, showing error bounds of exp(-2n(θ-µB)²) for false positives and exp(-2n(µV-θ)²) for false negatives. However, this applies to the ensemble variant, not the single-detector MELON actually evaluated in experiments.",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "MELON's false positive detections all represent legitimate security concerns rather than detection errors.",
    395       "evidence": "Section 4.4 reports 9 false positives for GPT-4o (9.28% FPR). The authors examined all cases and found they involve user tasks that request execution of unverified external instructions. Three examples are shown in Appendix E.",
    396       "supported": "moderate"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "Misleading 'Provable' claim in title",
    402       "detail": "The title claims 'Provable Defense' but the theoretical guarantee (Section 3.4) applies to an ensemble of n detectors, not the single-detector MELON evaluated in experiments. The bounds require µB < θ < µV which is assumed but never empirically verified. The evaluated system and the 'provable' system are different methods."
    403     },
    404     {
    405       "flag": "No held-out validation set",
    406       "detail": "Hyperparameter selection (θ=0.8) and all sensitivity analyses (Tables 3, 4) are performed on the same AgentDojo test data used for final evaluation. This conflates model selection with model evaluation."
    407     },
    408     {
    409       "flag": "No statistical testing despite comparative claims",
    410       "detail": "The paper claims MELON 'significantly outperforms' baselines but provides no significance tests, confidence intervals, or error bars. With temperature=0 single-run evaluation, there is no way to assess result stability."
    411     },
    412     {
    413       "flag": "Funder conflict of interest",
    414       "detail": "The paper thanks OpenAI for support and is funded by Microsoft (OpenAI's primary investor). The evaluation heavily features OpenAI models (GPT-4o, o3-mini) and uses OpenAI's embedding API. Both entities benefit from research showing LLM agents can be defended."
    415     },
    416     {
    417       "flag": "Limited attack diversity",
    418       "detail": "Only four general prompt injection attack types are tested, all from the same AgentDojo benchmark. No adaptive attacks specifically designed to evade MELON are considered. The paper acknowledges excluding agent-specific attacks due to low efficacy, but this limits generalizability claims."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents",
    424       "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunovic", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"],
    425       "year": 2024,
    426       "relevance": "Primary benchmark used for evaluation; defines the IPI attack/defense evaluation framework for LLM agents."
    427     },
    428     {
    429       "title": "StruQ: Defending Against Prompt Injection with Structured Queries",
    430       "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"],
    431       "year": 2024,
    432       "arxiv_id": "2402.06363",
    433       "relevance": "Proposes adversarial training defense against prompt injection in LLMs, representing the training-based defense category."
    434     },
    435     {
    436       "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions",
    437       "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"],
    438       "year": 2024,
    439       "arxiv_id": "2404.13208",
    440       "relevance": "Proposes training-based defense that teaches LLMs to prioritize system instructions over injected ones."
    441     },
    442     {
    443       "title": "Adversarial Attacks on Multimodal Agents",
    444       "authors": ["Chen Henry Wu", "Jing Yu Koh", "Ruslan Salakhutdinov", "Daniel Fried", "Aditi Raghunathan"],
    445       "year": 2024,
    446       "arxiv_id": "2406.12814",
    447       "relevance": "Evaluates adversarial attacks on multimodal web agents including indirect prompt injection via images."
    448     },
    449     {
    450       "title": "Defending Against Indirect Prompt Injection Attacks With Spotlighting",
    451       "authors": ["Keegan Hines", "Gary Lopez", "Matthew Hall", "Federico Zarfati", "Yonatan Zunger", "Emre Kiciman"],
    452       "year": 2024,
    453       "arxiv_id": "2403.14720",
    454       "relevance": "Proposes delimiter-based prompt augmentation defense against indirect prompt injection."
    455     },
    456     {
    457       "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents",
    458       "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"],
    459       "year": 2024,
    460       "doi": "10.18653/v1/2024.findings-acl.624",
    461       "relevance": "Benchmark for evaluating indirect prompt injection attacks in tool-integrated LLM agents."
    462     },
    463     {
    464       "title": "Identifying the Risks of LM Agents with an LM-Emulated Sandbox",
    465       "authors": ["Yangjun Ruan", "Honghua Dong", "Andrew Wang", "Silviu Pitis", "Yongchao Zhou", "Jimmy Ba", "Yann Dubois", "Chris J. Maddison", "Tatsunori Hashimoto"],
    466       "year": 2024,
    467       "relevance": "Proposes sandbox-based approach for identifying risks in LLM agents, relevant to agent safety evaluation."
    468     },
    469     {
    470       "title": "The Task Shield: Enforcing Task Alignment to Defend Against Indirect Prompt Injection in LLM Agents",
    471       "authors": ["Feiran Jia", "Tong Wu", "Xin Qin", "Anna Squicciarini"],
    472       "year": 2024,
    473       "arxiv_id": "2412.16682",
    474       "relevance": "Proposes alignment-check defense against IPI in LLM agents, conceptually similar to MELON's approach of verifying tool call alignment."
    475     },
    476     {
    477       "title": "Agent Security Bench (ASB): Formalizing and Benchmarking Attacks and Defenses in LLM-based Agents",
    478       "authors": ["Hanrong Zhang", "Jingyuan Huang", "Kai Mei", "Yifei Yao", "Zhenting Wang", "Chenlu Zhan", "Hongwei Wang", "Yongfeng Zhang"],
    479       "year": 2024,
    480       "arxiv_id": "2410.02644",
    481       "relevance": "Benchmark for formalizing attacks and defenses in LLM-based agents, directly relevant to agent security evaluation."
    482     },
    483     {
    484       "title": "Aligning LLMs to be Robust Against Prompt Injection",
    485       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "Chuan Guo"],
    486       "year": 2024,
    487       "arxiv_id": "2410.05451",
    488       "relevance": "Proposes alignment-based training defense against prompt injection attacks."
    489     },
    490     {
    491       "title": "A New Era in LLM Security: Exploring Security Concerns in Real-World LLM-Based Systems",
    492       "authors": ["Fangzhou Wu", "Ning Zhang", "Somesh Jha", "Patrick McDaniel", "Chaowei Xiao"],
    493       "year": 2024,
    494       "arxiv_id": "2402.18649",
    495       "relevance": "Surveys security concerns in real-world LLM-based systems including prompt injection."
    496     },
    497     {
    498       "title": "IsolateGPT: An Execution Isolation Architecture for LLM-Based Systems",
    499       "authors": ["Yuhao Wu", "Franziska Roesner", "Tadayoshi Kohno", "Ning Zhang", "Umar Iqbal"],
    500       "year": 2025,
    501       "relevance": "Proposes execution isolation defense for LLM systems, alternative architectural approach to defending against prompt injection."
    502     }
    503   ],
    504   "engagement_factors": {
    505     "practical_relevance": {
    506       "score": 2,
    507       "justification": "Defense method applicable to production LLM agents with code released, but requires 2x compute overhead and integration with agent framework."
    508     },
    509     "surprise_contrarian": {
    510       "score": 1,
    511       "justification": "Novel masking-based detection approach is technically interesting but doesn't challenge widely-held beliefs about AI safety."
    512     },
    513     "fear_safety": {
    514       "score": 3,
    515       "justification": "Directly demonstrates and defends against prompt injection attacks that can redirect AI agents to perform unauthorized actions like sending money or leaking data."
    516     },
    517     "drama_conflict": {
    518       "score": 0,
    519       "justification": "Standard academic defense paper with no controversy, no claims of broken systems or dishonest benchmarks."
    520     },
    521     "demo_ability": {
    522       "score": 2,
    523       "justification": "Code available on GitHub (https://github.com/kaijiezhu11/MELON), though requires API keys and AgentDojo setup to reproduce."
    524     },
    525     "brand_recognition": {
    526       "score": 1,
    527       "justification": "Authors from UCSB and William & Mary, published at ICML. OpenAI support acknowledged but not an OpenAI paper."
    528     }
    529   }
    530 }

Impressum · Datenschutz