ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27142B)


      1 {
      2   "paper": {
      3     "title": "Agent Security Bench (ASB): Formalizing and Benchmarking Attacks and Defenses in LLM-Based Agents",
      4     "authors": [
      5       "Hanrong Zhang",
      6       "Jingyuan Huang",
      7       "Kai Mei",
      8       "Yifei Yao",
      9       "Zhenting Wang",
     10       "Chenlu Zhan",
     11       "Hongwei Wang",
     12       "Yongfeng Zhang"
     13     ],
     14     "year": 2024,
     15     "venue": "ICLR 2025",
     16     "arxiv_id": "2410.02644"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract and Appendix F (Reproducibility Statement) state that source code is available at https://github.com/agiresearch/ASB, including attack scripts, configuration files, and Docker setup."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The benchmark data (tools in JSONL format, attack/normal tool definitions) is included in the GitHub repository (data/all_normal_tools.jsonl and data/all_attack_tools.jsonl referenced in Appendix C.2.6)."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Appendix F states that environment setup is provided via requirements.txt for GPU/non-GPU systems, and Conda or Docker installation instructions are included in the repository."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Appendix F describes predefined attack scripts (e.g., scripts/agent_attack.py), YAML configuration files in config/, and step-by-step instructions for setting up API keys and running experiments."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results are reported as point estimates (percentages) only. No confidence intervals, error bars, or uncertainty measures appear anywhere in the paper's tables."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper makes comparative claims throughout (e.g., 'Mixed Attack is the most impactful', 'DPI is more effective than IPI') but uses no statistical significance tests — only comparisons of raw percentages."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Effect sizes are not reported. Results are raw ASR percentages; no standardized effect size measures (Cohen's d, etc.) are used, and there is no quantification of practical significance beyond the raw numbers."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The benchmark uses 50 agent tasks and 400 attack tasks (Tab. 3), but no justification is provided for why these quantities were chosen or whether they provide sufficient statistical power."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance, standard deviation, or spread measures are reported across runs. All results appear to be single-run numbers; it is unclear whether experiments were repeated or how stable the results are."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper compares ASB against InjecAgent and AgentDojo in Table 12, showing that ASB covers more attack/defense types and scenarios. Individual attack and defense results are compared across 13 LLM backbones."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The two main comparison benchmarks (InjecAgent, Zhan et al. 2024; AgentDojo, Debenedetti et al. 2024) are contemporary. The defense methods used are recent (2022-2024) and represent the current state of practice."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Appendix D.1.2 analyzes different attack combinations (DPI+IPI, DPI+MP, IPI+MP, full mixed) systematically, and D.1.3 tests different backdoor triggers — these function as ablations of attack components."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Seven metrics are defined and used: ASR, Refuse Rate, PNA, BP, FNR, FPR, and the novel NRP metric (Table 4), providing multiple angles on agent security and utility."
     88       },
     89       "human_evaluation": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "The evaluation is entirely automated — tool invocation is checked programmatically via simulated API calls. Human evaluation of outputs is clearly irrelevant for this kind of benchmark."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The PoT backdoor testing uses tasks 'different from those in the PoT demonstration to ensure the independence and diversity of the experimental results' (Appendix C.2.2). No prompt tuning on the test set is described."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down by attack type (DPI, IPI, Memory Poisoning, PoT Backdoor, Mixed) and by LLM backbone across Tables 5-8 and 14-21, providing granular per-category analysis."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Appendix D.2 provides detailed analysis of why each defense fails (Sec. D.2.1 for DPI/IPI, D.2.2 for PoT, D.2.3 for memory attacks), including qualitative reasoning about the failure mechanisms."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The core finding is that current defenses are largely ineffective (Sec. 5.4, Tables 7-8, D.2). The paper explicitly reports that defenses reduce ASR only modestly and often cause utility losses."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims (highest average ASR 84.30%, limited defense effectiveness, 10 scenarios, 400+ tools, 27 attack/defense methods, 7 metrics, 13 LLM backbones) are all directly supported by Table 5 and Table 3."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper makes causal claims (e.g., 'better agents with stronger backbone LLMs initially exhibit higher ASR due to their superior ability to follow instructions') based only on correlational patterns in Fig. 2 — no controlled experiments isolate this causal mechanism. The ablation combinations also do not isolate single variables cleanly."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper presents conclusions about 'LLM-based agents' broadly, but all evaluations use synthetic/simulated task environments with AI-generated tools and tasks. Claims about real-world agent vulnerability outrun the simulated evaluation setting."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "No threats-to-validity or limitations section exists. The paper does not consider alternative explanations for results, such as whether the synthetic benchmark tasks and AI-generated tools reflect realistic attack conditions."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "Models are referred to by marketing names only: 'Claude-3.5 Sonnet', 'GPT-4o', 'GPT-4o-mini', 'GPT-3.5 Turbo'. No snapshot dates or API version strings (e.g., gpt-4o-2024-08-06) are provided. Model behavior changes across versions."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Appendix C.2.4 provides the full actual text of prompts used: the system prompt for LLM-based agents, prompts for paraphrasing defense, instructional prevention, sandwich prevention, dynamic prompt rewriting, refusal judgment, and PoT backdoor trigger generation."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported anywhere in the paper. These settings significantly affect stochastic outputs and are necessary for reproducibility."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The ReAct framework is described in Appendix A.2 with formal notation for the agent's context and action space. The AIOS-based implementation details (Langchain, Chroma vector database, tool simulation) are provided in Appendix C.2.2 and C.2.6."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Appendix B describes how benchmark data was generated using GPT-4: agent descriptions, user tasks, standard plans, tool definitions, and attack tools — all generation processes are described with examples in Tables 9-11."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations section. The paper has an ethics statement (Appendix E) and a reproducibility statement (Appendix F) but no limitations or threats-to-validity section."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats-to-validity discussion exists. Specific concerns such as the use of simulated (not real) tool calls, AI-generated tasks, or the validity of ASR as a proxy for real-world harm are not addressed."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the results do not show. For example, it does not acknowledge that simulated tool calls may not reflect real API behavior, or that results on AI-generated tasks may not transfer to real deployments."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Raw data (tool definitions, attack/defense configurations) is available via the GitHub repository (data/all_normal_tools.jsonl and data/all_attack_tools.jsonl), allowing verification of benchmark construction."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Appendix B describes in detail how all benchmark data was generated: 10 scenarios × 5 tasks using GPT-4, with normal tools and attack tools generated following specified schema (tool name, description, expected achievement, corresponding agent)."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants were involved. The benchmark data is AI-generated (GPT-4). This question is not applicable."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The full pipeline from task generation (GPT-4) through tool simulation (JSONL files → SimulatedTool/AttackerTool objects) to evaluation (tool invocation checking against expected output) is documented in Appendices B and C.2."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding acknowledgment appears anywhere in the paper. There is no acknowledgments section listing grants, institutional support, or corporate sponsors."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly stated on the title page: Hanrong Zhang, Yifei Yao, Chenlu Zhan, and Hongwei Wang are from Zhejiang University; Jingyuan Huang, Kai Mei, Zhenting Wang, and Yongfeng Zhang are from Rutgers University."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding source is disclosed. The schema says 'NA if unfunded,' but the absence of a funding disclosure does not confirm the work is unfunded — it may simply be undisclosed. Since we cannot confirm unfunded status, the criterion applies but cannot be satisfied."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement appears in the paper. The absence of such a declaration means financial interests cannot be ruled out."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "Training data cutoffs are not stated for any of the 13 LLM backbones tested. Although the tasks evaluate tool-calling behavior rather than factual recall, contamination of the AI-generated benchmark through training data remains a concern."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No analysis of potential train/test overlap is provided. Since the benchmark tasks were generated by GPT-4, and GPT-4o and GPT-4o-mini are among the tested models, there is a risk that the task structure reflects GPT-4's priors, but this is not discussed."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The benchmark is newly constructed (2024) and the tasks are AI-generated, so direct contamination of the tasks is unlikely. However, the paper uses prompt injection patterns from public sources (OWASP 2023, Liu et al. 2024) that models may have been trained on, and this is not discussed."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants were involved. The ethics statement (Appendix E) explicitly states 'No human subjects were involved in this study.'"
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants were involved per the ethics statement (Appendix E)."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants were involved per the ethics statement (Appendix E)."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants were involved per the ethics statement (Appendix E)."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants were involved per the ethics statement (Appendix E)."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants were involved per the ethics statement (Appendix E)."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants were involved per the ethics statement (Appendix E)."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No inference costs, API costs, or latency figures are reported. The paper tests 13 LLM backbones across hundreds of attack configurations, which would involve substantial API costs (especially for closed-source models), but these are not quantified."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "Total computational budget (GPU hours, API costs, total API calls) is not stated anywhere in the paper. Given the scale of experiments (13 LLMs × multiple attack types × hundreds of tasks), this is a notable omission."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "The highest average attack success rate across all attack types is 84.30%, achieved by the Mixed Attack.",
    295       "evidence": "Table 5 shows Mixed Attack achieves 84.30% average ASR across 13 LLM backbones. Table 14 also confirms this figure.",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "Current defenses against prompt injection and backdoor attacks are largely ineffective.",
    300       "evidence": "Tables 7-8 show defense methods reduce DPI ASR from 72.68% to at best 44.45% (Dynamic Prompt Rewriting) and IPI ASR by only 3%; Appendix D.2 details failure analysis.",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "Better agents with stronger backbone LLMs initially exhibit higher ASR due to their superior ability to follow instructions.",
    305       "evidence": "Figure 2(c) shows a rise-then-fall pattern, and Section 5.3 discusses GPT-4o (60.35% ASR with 20.05% refusal) vs GPT-3.5 Turbo (98.40% ASR, 3% refusal). This is a correlational observation.",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "PoT backdoor attacks achieve near-perfect ASR (~99.90%) while maintaining similar benign performance to non-backdoored agents.",
    310       "evidence": "Table 15 shows ASR of 99.75-100% across all five tested triggers; Table 16 shows BP and PNA values are closely matched per backbone.",
    311       "supported": "strong"
    312     },
    313     {
    314       "claim": "The NRP metric effectively identifies agents that balance utility and security.",
    315       "evidence": "Table 6 shows NRP scores with Claude-3.5 Sonnet (43.56%) and LLaMA3-70B (30.03%) as top performers, but there is no external validation of NRP's predictive validity or comparison to alternative composite metrics.",
    316       "supported": "weak"
    317     },
    318     {
    319       "claim": "Agent performance is generally weaker than LLM leaderboard quality.",
    320       "evidence": "Figure 2(b) shows most models fall below the y=x line when plotting LLM leaderboard quality vs PNA. Claude-3.5 Sonnet, LLaMA3-70B, and GPT-4o are exceptions.",
    321       "supported": "moderate"
    322     }
    323   ],
    324   "methodology_tags": [
    325     "benchmark-eval"
    326   ],
    327   "key_findings": "Agent Security Bench (ASB) introduces a comprehensive framework evaluating 13 LLM backbones against 5 attack types and 11 defenses across 10 application scenarios. Mixed attacks combining DPI, IPI, and memory poisoning achieve the highest average attack success rate of 84.30%. Current prevention-based defenses are largely ineffective — the best defense (Dynamic Prompt Rewriting) still leaves 44.45% DPI ASR — while detection-based defenses for memory attacks fail to identify 66% of attacks (FNR=0.660). The paper introduces the NRP metric as a composite measure of utility and adversarial resilience.",
    328   "red_flags": [
    329     {
    330       "flag": "No model version strings",
    331       "detail": "All 13 LLM backbones are identified by marketing name only (e.g., 'Claude-3.5 Sonnet', 'GPT-4o', 'GPT-3.5 Turbo') without snapshot dates or API version IDs. These models change frequently and results cannot be precisely reproduced without version pinning."
    332     },
    333     {
    334       "flag": "No statistical uncertainty quantification",
    335       "detail": "All attack success rates and performance metrics are reported as single point estimates with no confidence intervals, error bars, or repeated-run variance. It is unknown how stable these percentages are across different random seeds or orderings."
    336     },
    337     {
    338       "flag": "AI-generated benchmark tasks and tools",
    339       "detail": "All 50 agent tasks, 400+ attack tools, and their descriptions were generated using GPT-4 (Appendix B). This creates circularity: some tested models (GPT-4o, GPT-4o-mini) are variants of the model used to generate the benchmark, which may advantage or disadvantage them in ways not analyzed."
    340     },
    341     {
    342       "flag": "Simulated tool calls, not real APIs",
    343       "detail": "All evaluations use pre-scripted simulated tool responses that return fixed strings (Appendix C.2.1). Real-world attack success may differ substantially when actual API behavior, error handling, and rate limiting are involved."
    344     },
    345     {
    346       "flag": "No limitations section",
    347       "detail": "The paper contains no dedicated limitations or threats-to-validity section. Key scope boundaries — such as the simulation-only evaluation, AI-generated tasks, or the restriction to tool-invocation as the sole success criterion — are not explicitly acknowledged as limitations."
    348     },
    349     {
    350       "flag": "No funding disclosure",
    351       "detail": "No funding acknowledgment appears anywhere in the paper, making it impossible to assess potential conflicts of interest."
    352     },
    353     {
    354       "flag": "Causal claims from correlational data",
    355       "detail": "Section 5.3 makes causal claims about why stronger LLMs have higher ASR (e.g., 'due to their superior ability to follow instructions'), but these are inferred from correlational plots in Figure 2 with no controlled experiments isolating the mechanism."
    356     }
    357   ],
    358   "cited_papers": [
    359     {
    360       "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents",
    361       "authors": [
    362         "Qiusi Zhan",
    363         "Zhixiang Liang",
    364         "Zifan Ying",
    365         "Daniel Kang"
    366       ],
    367       "year": 2024,
    368       "doi": "10.18653/v1/2024.findings-acl.624",
    369       "relevance": "Direct predecessor benchmark for indirect prompt injection in LLM agents, compared against ASB in the paper."
    370     },
    371     {
    372       "title": "AgentDojo: A Dynamic Environment to Evaluate Attacks and Defenses for LLM Agents",
    373       "authors": [
    374         "Edoardo Debenedetti",
    375         "Jie Zhang",
    376         "Mislav Balunovic",
    377         "Luca Beurer-Kellner",
    378         "Marc Fischer",
    379         "Florian Tramer"
    380       ],
    381       "year": 2024,
    382       "relevance": "A competing agent security benchmark evaluated against ASB; covers IPI attacks and defenses in limited scenarios."
    383     },
    384     {
    385       "title": "AgentPoison: Red-teaming LLM Agents via Poisoning Memory or Knowledge Bases",
    386       "authors": [
    387         "Zhaorun Chen",
    388         "Zhen Xiang",
    389         "Chaowei Xiao",
    390         "Dawn Song",
    391         "Bo Li"
    392       ],
    393       "year": 2024,
    394       "arxiv_id": "2407.12784",
    395       "relevance": "Prior work on memory poisoning in LLM agents, directly related to the memory poisoning attack category in ASB."
    396     },
    397     {
    398       "title": "Formalizing and Benchmarking Prompt Injection Attacks and Defenses",
    399       "authors": [
    400         "Yupei Liu",
    401         "Yuqi Jia",
    402         "Runpeng Geng",
    403         "Jinyuan Jia",
    404         "Neil Zhenqiang Gong"
    405       ],
    406       "year": 2024,
    407       "relevance": "Formalizes prompt injection attacks; ASB builds on this formalization and extends it to the full LLM agent framework."
    408     },
    409     {
    410       "title": "BadChain: Backdoor Chain-of-Thought Prompting for Large Language Models",
    411       "authors": [
    412         "Zhen Xiang",
    413         "Fengqing Jiang",
    414         "Zidi Xiong",
    415         "Bhaskar Ramasubramanian",
    416         "Radha Poovendran",
    417         "Bo Li"
    418       ],
    419       "year": 2024,
    420       "relevance": "Proposes backdoor attacks on chain-of-thought prompting; the ASB PoT backdoor attack is directly inspired by and extends BadChain to agentic settings."
    421     },
    422     {
    423       "title": "BadAgent: Inserting and Activating Backdoor Attacks in LLM Agents",
    424       "authors": [
    425         "Yifei Wang",
    426         "Dizhan Xue",
    427         "Shengjie Zhang",
    428         "Shengsheng Qian"
    429       ],
    430       "year": 2024,
    431       "arxiv_id": "2406.03007",
    432       "relevance": "Concurrent work on backdoor attacks specifically targeting LLM agents via task data contamination."
    433     },
    434     {
    435       "title": "Watch Out for Your Agents! Investigating Backdoor Threats to LLM-based Agents",
    436       "authors": [
    437         "Wenkai Yang",
    438         "Xiaohan Bi",
    439         "Yankai Lin",
    440         "Sishuo Chen",
    441         "Jie Zhou",
    442         "Xu Sun"
    443       ],
    444       "year": 2024,
    445       "arxiv_id": "2402.11208",
    446       "relevance": "Investigates backdoor threats to LLM agents via fine-tuning contamination, complementary approach to the training-free PoT attack in ASB."
    447     },
    448     {
    449       "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    450       "authors": [
    451         "Kai Greshake",
    452         "Sahar Abdelnabi",
    453         "Shailesh Mishra",
    454         "Christoph Endres",
    455         "Thorsten Holz",
    456         "Mario Fritz"
    457       ],
    458       "year": 2023,
    459       "relevance": "Seminal work demonstrating indirect prompt injection attacks against real-world LLM-integrated applications."
    460     },
    461     {
    462       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    463       "authors": [
    464         "Shunyu Yao",
    465         "Jeffrey Zhao",
    466         "Dian Yu",
    467         "Nan Du",
    468         "Izhak Shafran",
    469         "Karthik Narasimhan",
    470         "Yuan Cao"
    471       ],
    472       "year": 2022,
    473       "arxiv_id": "2210.03629",
    474       "relevance": "Defines the ReAct agent framework that ASB uses as its baseline agent architecture for all experiments."
    475     },
    476     {
    477       "title": "DecodingTrust: A Comprehensive Assessment of Trustworthiness in GPT Models",
    478       "authors": [
    479         "Boxin Wang",
    480         "Weixin Chen",
    481         "Hengzhi Pei"
    482       ],
    483       "year": 2023,
    484       "relevance": "Related work on comprehensive trustworthiness evaluation of LLMs, providing context for the security-focused evaluation in ASB."
    485     },
    486     {
    487       "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
    488       "authors": [
    489         "Evan Hubinger",
    490         "Carson Denison",
    491         "Jesse Mu"
    492       ],
    493       "year": 2024,
    494       "arxiv_id": "2401.05566",
    495       "relevance": "Related work on persistent backdoor behaviors in LLMs that survive safety training, relevant to the backdoor attack category in ASB."
    496     },
    497     {
    498       "title": "TrustAgent: Towards Safe and Trustworthy LLM-based Agents through Agent Constitution",
    499       "authors": [
    500         "Wenyue Hua",
    501         "Xianjun Yang",
    502         "Cheng Wei",
    503         "Ruixiang Tang",
    504         "Yongfeng Zhang"
    505       ],
    506       "year": 2024,
    507       "relevance": "Directly relevant work on building safe and trustworthy LLM agents, from the same research group as ASB."
    508     },
    509     {
    510       "title": "Baseline Defenses for Adversarial Attacks against Aligned Language Models",
    511       "authors": [
    512         "Neel Jain",
    513         "Avi Schwarzschild",
    514         "Yuxin Wen"
    515       ],
    516       "year": 2023,
    517       "relevance": "Provides baseline defense methods (paraphrasing, perplexity detection) that ASB adopts and evaluates in its defense benchmarking."
    518     }
    519   ]
    520 }

Impressum · Datenschutz