ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (29201B)


      1 {
      2   "paper": {
      3     "title": "The Task Shield: Enforcing Task Alignment to Defend Against Indirect Prompt Injection in LLM Agents",
      4     "authors": [
      5       "Feiran Jia",
      6       "Tong Wu",
      7       "Xin Qin",
      8       "Anna Squicciarini"
      9     ],
     10     "year": 2024,
     11     "venue": "Annual Meeting of the Association for Computational Linguistics",
     12     "arxiv_id": "2412.16682",
     13     "doi": "10.48550/arXiv.2412.16682"
     14   },
     15   "scan_version": 3,
     16   "active_modules": [
     17     "experimental_rigor",
     18     "data_leakage"
     19   ],
     20   "methodology_tags": [
     21     "benchmark-eval"
     22   ],
     23   "key_findings": "Task Shield reduces attack success rate to 2.07% on GPT-4o against the strongest indirect prompt injection attack (Important Instructions) while maintaining 69.79% task utility, significantly outperforming four baseline defenses on the AgentDojo benchmark. The approach reframes agent security from detecting harmful content to enforcing task alignment, verifying that every agent action contributes to user-specified goals. Results are consistent across GPT-4o and GPT-4o-mini, though evaluation is limited to a single benchmark and model family with single-run experiments.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No repository URL or code archive is provided for the Task Shield implementation. The paper references the AgentDojo benchmark repository but does not release its own code."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The evaluation uses the publicly available AgentDojo benchmark (released under MIT License at https://github.com/ethz-spylab/agentdojo). Baseline results are also drawn from AgentDojo's published data."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Specific model versions are stated (Appendix C.2) and temperature=0.0 is mentioned, but no requirements.txt, Dockerfile, or environment specification is provided for reproducing the Task Shield framework."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions are provided. The paper describes the algorithm (Algorithm 1) and prompts (Appendix D) but does not provide runnable scripts or a README with reproduction commands."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "All results in Tables 1-3 are point estimates (percentages) with no confidence intervals, error bars, or uncertainty measures."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Claims of outperformance (e.g., Task Shield vs. baselines) are based on comparing raw percentages without any statistical significance tests."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Tables 1-2 report both absolute values for Task Shield and baselines (e.g., ASR 2.07% vs. 47.69% no-defense, utility 69.79% vs. 50.08%), providing sufficient context to assess effect magnitude."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper states 'one trial per task' (Section 5) with no justification for this sample size or discussion of power analysis. No rationale is given for why a single trial is sufficient."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "Section 5 states 'one trial per task' — single-run results with no variance, standard deviation, or any spread measure reported."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Four baseline defenses are compared: Data Delimiting, PI Detector, Repeat Prompt (Prompt Sandwiching), and Tool Filter (Section 5.1, Table 2)."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "All baselines are from 2024 papers: Struq (Chen et al., 2024), Palisade (Kokkula et al., 2024), Learn Prompting (2024), and AgentDojo's Tool Filter (Debenedetti et al., 2024)."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Task Shield has three components (instruction extraction, alignment check, feedback generation) but no ablation study is conducted to measure individual component contributions."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Three evaluation metrics are used: Clean Utility (CU), Utility under Attack (U), and Attack Success Rate (ASR), as defined in Section 5.1."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "All evaluation is automated through the AgentDojo benchmark. No human evaluation of Task Shield's outputs or decisions is performed."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "The paper does not discuss any development/test split. It is unclear whether the prompts and threshold ε were designed or tuned using the same AgentDojo tasks used for final evaluation."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Tables 1-3 provide per-suite breakdowns across Travel, Workspace, Banking, and Slack task suites, in addition to overall results."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "No qualitative error analysis or examples of where Task Shield fails are provided. The Limitations section mentions potential failure modes (weaker models, adaptive attacks) abstractly but does not examine specific failure cases."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "Every comparison shows Task Shield performing well. No experiments that didn't work, ablations that hurt performance, or abandoned approaches are reported."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The abstract claims 2.07% ASR and 69.79% utility on GPT-4o, which match Table 2's Overall column for Task Shield under Important Instructions attack. The claim of 'significantly outperforming existing defenses' is supported by the comparative data."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The primary causal claim is that Task Shield reduces ASR. The experimental design—same model, same benchmark, same attack, with and without the defense—constitutes controlled single-variable manipulation adequate for this causal claim."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The title claims 'Defend Against Indirect Prompt Injection in LLM Agents' broadly, but results are limited to one benchmark (AgentDojo), one model family (GPT), and three attack types. The Limitations section acknowledges this but the title and framing are broader than what was tested."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "No alternative explanations for the results are discussed. For example, the paper does not consider whether the low ASR could be due to the specific attack formulations in AgentDojo rather than Task Shield's general effectiveness."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper measures ASR and utility directly on benchmark tasks. These are the same metrics used by the benchmark and directly map to the paper's claims about defense effectiveness and utility preservation. No proxy gap exists."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix C.2 states exact model versions: 'gpt-4o-2024-05-13', 'gpt-4o-mini-2024-07-18', and 'gpt-3.5-turbo-0125'."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Full prompt texts are provided in Appendix D: task extraction prompt (Figure 4), content checker prompt (Figure 5), tool call checker prompt (Figure 6), and feedback prompts (Figure 7)."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "Appendix C.2 reports temperature=0.0 for Task Shield and 'default configurations for the models' for the agent, but does not specify top-p, max tokens, or other sampling parameters. 'Default configurations' is insufficiently specific."
    163       },
    164       "scaffolding_described": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "The Task Shield framework is described in detail in Section 4: three components (instruction extraction, alignment check, feedback generation), Algorithm 1 for core processing, and Section 4.2 describes interaction with the LLM agent system at each message level."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The evaluation uses the AgentDojo benchmark in its standard configuration. Section 5.1 and Appendix C.2 describe the setup, including that baseline results come from AgentDojo's published data. No custom preprocessing was applied."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "A dedicated 'Limitations' section appears after the Conclusion, discussing reliance on LLMs, vulnerability to adaptive attacks, and resource constraints limiting evaluation scope."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The Limitations section identifies specific threats: (1) performance degradation with weaker language models, (2) susceptibility to adaptive attacks that could fool the alignment checker, and (3) experiments restricted to a single benchmark and model family due to cost."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "The Limitations section explicitly states 'restricted our experiments to a single benchmark and a single model family' and identifies vulnerability to adaptive attacks as an unaddressed threat."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "No raw experimental data (individual task results, model outputs, alignment scores) is released. Only aggregated percentages are reported in the tables."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 5.1 describes the benchmark (AgentDojo with four task suites), three attack types, and the evaluation protocol including one trial per task. Model versions are specified in Appendix C.2."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants. Evaluation is entirely automated using the AgentDojo benchmark."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "The paper does not document the full pipeline from running experiments to producing the final tables. It is unclear how the aggregated percentages were computed from individual task results, or how edge cases (e.g., partial task completion) were handled."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding sources or acknowledgments section is present in the paper."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are clearly listed: Pennsylvania State University, Princeton University, and California State University, Long Beach. These are academic institutions with no apparent commercial conflict."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure statement does not confirm absence of funding."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests or financial disclosure statement appears in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "Model versions are specified (gpt-4o-2024-05-13) but no training data cutoff date is stated. It is unknown whether GPT-4o's training data includes AgentDojo-related content."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No discussion of whether GPT-4o's training data might include AgentDojo tasks or similar content that could inflate utility scores."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "AgentDojo was published in June 2024 (arXiv:2406.13352). GPT-4o-2024-05-13 predates this, but early benchmark versions or related content could have been in training data. No contamination analysis is provided."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study. Evaluation is entirely automated using the AgentDojo benchmark."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in this study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Task Shield adds multiple LLM calls per message (instruction extraction + alignment checking). The Limitations section mentions 'high cost of LLM queries' but no quantification of cost, latency, or token overhead is provided."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No total computational budget, API costs, or hardware specifications are reported for the experiments."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Section 5 states 'one trial per task' — single-run results with no seed sensitivity analysis."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Section 5 explicitly states 'with one trial per task,' making the number of runs clear."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No hyperparameter search is described. The threshold ε and prompt designs appear to be fixed without reporting any tuning process or search budget."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "No discussion of how the Task Shield configuration (prompts, threshold) was selected or whether alternatives were tried."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Multiple comparisons are made across 4 task suites, 5 defense methods, 3 attack types, and 3 models, but no correction for multiple comparisons is applied."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors implement and evaluate their own defense against baselines whose results come from AgentDojo's published data. No acknowledgment of potential self-comparison bias in their implementation."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "Task Shield adds multiple LLM calls per message compared to simpler defenses (e.g., Repeat Prompt adds only prompt text). This compute overhead is not quantified or compared."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No discussion of whether AgentDojo's task suites and attack formulations are representative of real-world indirect prompt injection threats. The benchmark is accepted at face value."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": true,
    345         "answer": true,
    346         "justification": "All defense comparisons use the same base model (GPT-4o), isolating the defense mechanism as the variable. The scaffold (defense) IS the thing being tested, and the experimental design keeps other variables constant."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of temporal leakage. GPT-4o (May 2024) predates AgentDojo's publication (June 2024), but earlier versions or related content could have been in training data."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether the AgentDojo evaluation setup leaks information that would not be available in real deployment scenarios."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No discussion of independence between AgentDojo's task suites or whether tasks within a suite share structural similarities that could bias results."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No leakage detection or prevention method is applied."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Task Shield reduces attack success rate to 2.07% on GPT-4o against the Important Instructions attack while maintaining 69.79% utility under attack.",
    375       "evidence": "Table 2, Overall column: Task Shield achieves ASR 2.07% and U 69.79% vs. no-defense ASR 47.69% and U 50.08% on GPT-4o.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Task Shield significantly outperforms all baseline defenses in the security-utility tradeoff.",
    380       "evidence": "Table 2 and Figure 3 Pareto front analysis show Task Shield in the desirable lower-right region (low ASR, high utility). Next-best baseline (Tool Filter) achieves 6.84% ASR with 56.28% utility.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Task Shield maintains consistent effectiveness across both GPT-4o and GPT-4o-mini architectures.",
    385       "evidence": "Table 2 shows Task Shield achieving 2.07% ASR on GPT-4o and 2.23% ASR on GPT-4o-mini, with utility roughly matching no-defense baselines for both models.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Task alignment is a novel and orthogonal perspective that reframes agent security from preventing harmful actions to ensuring task alignment.",
    390       "evidence": "Section 3 formalizes task alignment with definitions of ContributesTo relation and task instruction alignment condition. The framework is distinct from existing rule-based, spotlighting, and authentication approaches.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "Single run per task",
    397       "detail": "All results are from a single trial per task with no variance reporting. Given the stochastic nature of LLM outputs (even at temperature 0.0 due to batching effects), single-run results may not be stable."
    398     },
    399     {
    400       "flag": "No ablation study",
    401       "detail": "Task Shield has three distinct components (instruction extraction, alignment check, feedback generation) but no ablation study measures individual component contributions. It is unclear which components drive the defense effectiveness."
    402     },
    403     {
    404       "flag": "Single benchmark and model family",
    405       "detail": "Evaluation is limited to AgentDojo with GPT-4o/4o-mini/3.5-turbo. Generalization to other benchmarks (e.g., InjecAgent's full suite) and model families (Claude, Llama) is unknown."
    406     },
    407     {
    408       "flag": "No code released",
    409       "detail": "The Task Shield implementation is not released, making independent verification impossible. While prompts are provided in the appendix, the full pipeline cannot be reproduced."
    410     },
    411     {
    412       "flag": "Cost overhead not quantified",
    413       "detail": "Task Shield adds multiple LLM calls per message (instruction extraction + alignment checking for every assistant message and tool output). The computational and financial cost overhead is never quantified, despite the Limitations section acknowledging 'high cost of LLM queries.'"
    414     },
    415     {
    416       "flag": "Same LLM used for both the defense and the defended agent",
    417       "detail": "Using the same model (GPT-4o) for both the Task Shield defense and the agent being defended creates a potential circular dependency. The paper does not discuss whether the defense's effectiveness is model-specific or generalizable."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "Agentdojo: A dynamic environment to evaluate attacks and defenses for llm agents",
    423       "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunović", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"],
    424       "year": 2024,
    425       "arxiv_id": "2406.13352",
    426       "relevance": "Primary evaluation benchmark for indirect prompt injection attacks and defenses in tool-integrated LLM agents."
    427     },
    428     {
    429       "title": "The instruction hierarchy: Training llms to prioritize privileged instructions",
    430       "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"],
    431       "year": 2024,
    432       "arxiv_id": "2404.13208",
    433       "relevance": "Training-time defense establishing instruction privilege hierarchy for LLMs, used as foundational concept in Task Shield's formalization."
    434     },
    435     {
    436       "title": "Struq: Defending against prompt injection with structured queries",
    437       "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"],
    438       "year": 2024,
    439       "arxiv_id": "2402.06363",
    440       "relevance": "Baseline defense method using data delimiting to isolate tool outputs from injected instructions."
    441     },
    442     {
    443       "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection",
    444       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    445       "year": 2023,
    446       "relevance": "Foundational work demonstrating indirect prompt injection attacks against real-world LLM-integrated applications."
    447     },
    448     {
    449       "title": "Injecagent: Benchmarking indirect prompt injections in tool-integrated large language model agents",
    450       "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"],
    451       "year": 2024,
    452       "arxiv_id": "2403.02691",
    453       "relevance": "Benchmark for indirect prompt injection in tool-integrated LLM agents; source of one of the three attack types evaluated."
    454     },
    455     {
    456       "title": "Defending against indirect prompt injection attacks with spotlighting",
    457       "authors": ["Keegan Hines", "Gary Lopez", "Matthew Hall", "Federico Zarfati", "Yonatan Zunger", "Emre Kiciman"],
    458       "year": 2024,
    459       "arxiv_id": "2403.14720",
    460       "relevance": "Defense method using source spotlighting to distinguish trusted from untrusted content in LLM inputs."
    461     },
    462     {
    463       "title": "Fath: Authentication-based test-time defense against indirect prompt injection attacks",
    464       "authors": ["Jiongxiao Wang", "Fangzhou Wu", "Wendi Li", "Jinsheng Pan", "Edward Suh", "Z. Morley Mao", "Muhao Chen", "Chaowei Xiao"],
    465       "year": 2024,
    466       "arxiv_id": "2410.21492",
    467       "relevance": "Test-time defense using hash-based authentication tags to filter harmful responses from indirect prompt injection."
    468     },
    469     {
    470       "title": "Jatmo: Prompt injection defense by task-specific finetuning",
    471       "authors": ["Julien Piet", "Maha Alrashed", "Chawin Sitawarin", "Sizhe Chen", "Zeming Wei", "Elizabeth Sun", "Basel Alomair", "David Wagner"],
    472       "year": 2023,
    473       "arxiv_id": "2312.17673",
    474       "relevance": "Training-time defense against prompt injection via task-specific model finetuning."
    475     },
    476     {
    477       "title": "Prompt injection attack against llm-integrated applications",
    478       "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li", "Kailong Wang"],
    479       "year": 2023,
    480       "arxiv_id": "2306.05499",
    481       "relevance": "Early characterization of prompt injection attacks against LLM-integrated applications."
    482     },
    483     {
    484       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    485       "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
    486       "year": 2024,
    487       "relevance": "Formal framework for benchmarking prompt injection attacks and defenses in various scenarios."
    488     },
    489     {
    490       "title": "Palisade – prompt injection detection framework",
    491       "authors": ["Sahasra Kokkula", "G Divya"],
    492       "year": 2024,
    493       "arxiv_id": "2410.21146",
    494       "relevance": "Prompt injection detection classifier used as a baseline defense in the evaluation."
    495     },
    496     {
    497       "title": "Instructional segment embedding: Improving llm safety with instruction hierarchy",
    498       "authors": ["Tong Wu", "Shujian Zhang", "Kaiqiang Song", "Silei Xu"],
    499       "year": 2024,
    500       "arxiv_id": "2410.09102",
    501       "relevance": "Training-time defense improving LLM safety through instruction hierarchy embeddings."
    502     },
    503     {
    504       "title": "GuardAgent: Safeguard llm agents by a guard agent via knowledge-enabled reasoning",
    505       "authors": ["Zhen Xiang", "Linzhi Zheng", "Yanjie Li", "Junyuan Hong"],
    506       "year": 2024,
    507       "arxiv_id": "2406.09187",
    508       "relevance": "Guard agent approach to safeguarding LLM agents using knowledge-enabled reasoning."
    509     },
    510     {
    511       "title": "Ignore previous prompt: Attack techniques for language models",
    512       "authors": ["Fábio Perez", "Ian Ribeiro"],
    513       "year": 2022,
    514       "arxiv_id": "2211.09527",
    515       "relevance": "Early work on prompt injection attack techniques; source of one of the three attack types evaluated in this paper."
    516     }
    517   ],
    518   "engagement_factors": {
    519     "practical_relevance": {
    520       "score": 2,
    521       "justification": "Task Shield is a practical test-time defense that could wrap existing LLM agents, but no code is released for immediate use."
    522     },
    523     "surprise_contrarian": {
    524       "score": 1,
    525       "justification": "The reframing from 'detect harm' to 'enforce task alignment' is a modest conceptual shift but not deeply contrarian."
    526     },
    527     "fear_safety": {
    528       "score": 2,
    529       "justification": "Demonstrates that without defense, indirect prompt injection succeeds 47.69% of the time on GPT-4o agents, raising real security concerns."
    530     },
    531     "drama_conflict": {
    532       "score": 0,
    533       "justification": "No controversy, no conflict with other researchers or companies."
    534     },
    535     "demo_ability": {
    536       "score": 0,
    537       "justification": "No code released, no demo available, no pip-installable tool."
    538     },
    539     "brand_recognition": {
    540       "score": 1,
    541       "justification": "Uses GPT-4o (well-known model) but authors are from Penn State, Princeton, and CSULB — not a famous AI lab."
    542     }
    543   }
    544 }

Impressum · Datenschutz