scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27789B)
      1 {
      2   "paper": {
      3     "title": "Adaptive Attacks Break Defenses Against Indirect Prompt Injection Attacks on LLM Agents",
      4     "authors": [
      5       "Qiusi Zhan",
      6       "Richard Fang",
      7       "Henil Shalin Panchal",
      8       "Daniel Kang"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv",
     12     "arxiv_id": "2503.00061"
     13   },
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The abstract states 'The code is available at https://github.com/uiuc-kang-lab/AdaptiveAttackAgent', providing a working GitHub repository URL."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses the publicly available InjecAgent benchmark (Zhan et al., 2024) which was published at ACL 2024 and is publicly accessible. No proprietary data was created."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Appendix A provides hyperparameters (Table 5) and mentions training on a single NVIDIA A100 GPU, but no requirements.txt, Dockerfile, or explicit library versions are mentioned in the paper. The hardware is specified but software environment dependencies are not."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "While code is available on GitHub, the paper itself does not include step-by-step reproduction instructions. Appendix A provides hyperparameters but not a complete reproduction workflow that a reader could follow from the paper alone."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Table 4 reports mean ASRs with standard deviation (±) for different attack types and stages (e.g., 'Direct Harm Vicuna-7B: 0.90 ± 0.06'), providing uncertainty quantification over defenses."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests (t-tests, p-values, etc.) are reported. The paper compares ASRs between conditions but relies only on descriptive statistics without formal hypothesis testing."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports ASR values before and after applying adaptive attacks (e.g., from 12% with adversarial finetuning to >50% after adaptive attack), providing concrete magnitude of improvement in context."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The evaluation uses 100 test cases (50 from each attack type), randomly selected from 1,054 cases in the InjecAgent benchmark. No power analysis or justification for why 100 cases is sufficient to support the conclusions is provided."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Table 4 reports standard deviations (±) for ASR-defense and ASR-adaptive attack averaged across defenses (e.g., '0.39 ± 0.19'). The variance is reported across the 8 defenses, providing spread information."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares ASRs against original (no defense, no adaptive attack) attacks as a baseline (shown in gray in Figure 2), and also compares adaptive vs. non-adaptive adversarial strings (Table 3), providing meaningful comparisons."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The defenses evaluated include recent work from 2023-2024, including perplexity filtering (Jain et al., 2023), adversarial finetuning (Piet et al., 2024), and others. The baselines are from the current literature on IPI defenses."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 3 presents a comparison of adaptive vs. non-adaptive adversarial strings for perplexity filtering and paraphrasing defenses, effectively ablating the adaptive component. Cross-evaluation in Figure 5 shows attack specificity."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper reports multiple metrics: ASR-defense, ASR-adaptive attack, target rate (Section 5.1), detection rate (Table 2), and valid rate (Figure 4/Section 6.2), providing a multi-dimensional evaluation."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is a technical security evaluation paper testing automated attack and defense systems. Human evaluation is clearly irrelevant to the claims being made about attack success rates."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper uses InjecAgent as the evaluation benchmark. The adversarial strings are trained (optimized) and then evaluated on the 100 test cases. For adversarial finetuning, training data consists of unsuccessful attacks (215 cases for Vicuna-7B, 816 for Llama3-8B), distinct from the test evaluation."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 4 provides detailed breakdown by attack type (direct harm vs. data stealing step 1 vs. step 2) and by agent type (Vicuna-7B vs. Llama3-8B). Figure 2 breaks down results by individual defense. This is thorough categorical analysis."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 6.3 explicitly discusses why adaptive attacks provide minimal improvement in the second step of data-stealing attacks, identifying a specific failure mode. The limitations section also discusses cases where AutoDAN struggles with the IPI setting."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that the genetic-algorithm-based AutoDAN 'struggles to adapt to the IPI setting' (Section 4) and that adaptive attacks are 'not as effective' on AgentDojo benchmark vs. InjecAgent (Appendix B), acknowledging where approaches fail."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims 'consistently achieving an attack success rate of over 50%' against all eight defenses, and Figure 2 confirms all ASR-adaptive attack values exceed 50% for both agents across all defenses evaluated."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper's causal claim is that adaptive attacks cause high ASRs against defenses. This is tested through controlled experiments where each adaptive attack is specifically designed for and evaluated against each defense, with the adversarial strings being the manipulated variable. The cross-evaluation (Figure 5) further supports causality by showing defense-specific attacks outperform general attacks."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper evaluates only two models (Vicuna-7B and Llama3-8B), a subset of 100 InjecAgent cases, and assumes white-box attacker access. The conclusion that 'adaptive attacks can bypass all defenses' makes broad generalizations without adequately bounding these to the specific models, access assumptions, and benchmark tested."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for why adaptive attacks succeed (e.g., whether the models are simply too weak, or whether the benchmark is unrepresentative). The limitations section notes white-box assumption and small defense subset but does not consider alternative interpretations of the findings."
    131       }
    132     },
    133     "setup_transparency": {
    134       "model_versions_specified": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The paper specifies Vicuna-7B v1.5 (with HuggingFace URL), Llama3-8B-Instruct (with HuggingFace URL), and gpt-4-0613 for simulating tool responses in data stealing evaluation. Specific model version IDs are provided."
    138       },
    139       "prompts_provided": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Appendix C provides full text of all prompts used: the ReAct agent prompt (C.1), LLM-based detector prompt (C.2), instructional prevention prompt (C.3), sandwich prevention prompt (C.4), and paraphrasing prompt (C.5). These are complete, not templates."
    143       },
    144       "hyperparameters_reported": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Table 5 (Appendix A) provides detailed hyperparameters: attack string position, token length, training steps for each defense/agent combination. LoRA hyperparameters (rank, alpha, dropout, learning rate, epochs) are also stated for adversarial finetuning."
    148       },
    149       "scaffolding_described": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The paper describes the full agent setup including the ReAct framework (with prompt in Appendix C.1), tool integration, how the agent processes observations, and the evaluation procedure following InjecAgent. The agentic scaffolding is clearly described."
    153       },
    154       "data_preprocessing_documented": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 5.1 describes the dataset selection: 'we randomly select 50 test cases from each attack type, creating a subset of 100 test cases.' For adversarial finetuning, the pipeline is described: 'we first evaluate all the test cases and filter out successful attacks and invalid outputs, keeping only the unsuccessful attacks' (215 for Vicuna-7B, 816 for Llama3-8B)."
    158       }
    159     },
    160     "limitations_and_scope": {
    161       "limitations_section_present": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 10 is titled 'Limitations' and provides a bulleted list of four specific limitations of the work."
    165       },
    166       "threats_to_validity_specific": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The limitations section identifies specific threats: (1) attacks focus only on first-step agent action, limiting effectiveness in multi-step attacks; (2) white-box access assumption; (3) no combination of defenses tested; (4) not exhaustive coverage of all possible defenses. These are specific to this study."
    170       },
    171       "scope_boundaries_stated": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The limitations explicitly state the paper 'does not account for the combination of defenses' and does not cover certain approaches like LLM self-evaluation. The AgentDojo appendix also notes the approach is less effective in long-context settings, bounding the scope."
    175       }
    176     },
    177     "data_integrity": {
    178       "raw_data_available": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The paper uses the publicly available InjecAgent benchmark. The code is released on GitHub, which should enable independent verification of results. The evaluation process and specific subset selection (random 50 cases per type) are documented."
    182       },
    183       "data_collection_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 5.1 describes the dataset clearly: InjecAgent benchmark with 1,054 test cases covering two attack types. The random sampling of 50 per attack type is stated, yielding 100 test cases with 25 unique direct harm attacks, 27 data stealing attacks, and 16 user scenarios."
    187       },
    188       "recruitment_methods_described": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "There are no human participants in this study. The paper uses a benchmark dataset of automated test cases, not human recruits."
    192       },
    193       "data_pipeline_documented": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The evaluation pipeline is documented: adversarial strings are trained on individual test cases, then evaluated. For adversarial finetuning, the pipeline from training data creation (filtering unsuccessful attacks) through finetuning to evaluation is described in Section 3.3 and Appendix A."
    197       }
    198     },
    199     "conflicts_of_interest": {
    200       "funding_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The acknowledgements state: 'We would like to acknowledge the Open Philanthropy project for funding this research in part.' Funding source is disclosed."
    204       },
    205       "affiliations_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Author affiliations are clearly stated: Qiusi Zhan, Richard Fang, and Daniel Kang are at University of Illinois Urbana-Champaign; Henil Shalin Panchal is at Nirma University. No evaluated commercial products are affiliated with the authors."
    209       },
    210       "funder_independent_of_outcome": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Open Philanthropy is a philanthropic organization with no commercial stake in whether defenses against IPI attacks are effective or not. The funder appears independent of the outcome."
    214       },
    215       "financial_interests_declared": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No competing interests statement is present in the paper. While no obvious conflict exists, the absence of an explicit declaration means this criterion is not satisfied."
    219       }
    220     },
    221     "contamination": {
    222       "training_cutoff_stated": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "The schema states applies=false (NA) when 'the paper does not evaluate a pre-trained model's capability on any benchmark' and gives the example 'studies that test defenses/tools rather than model knowledge.' This paper tests defense robustness against adaptive IPI attacks, not model knowledge or capability. The ASR measures whether the model follows injected instructions under various defense conditions, not whether the model 'knows' benchmark answers. Contamination (memorization of benchmark content) is not the relevant concern here."
    226       },
    227       "train_test_overlap_discussed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Same reasoning as training_cutoff_stated. The schema explicitly exempts 'studies that test defenses/tools rather than model knowledge' from contamination questions. This paper evaluates defense mechanisms against adaptive attacks, not the model's inherent capability on benchmark tasks. Whether the models saw InjecAgent during training does not invalidate the defense evaluation since the attacks are adaptively optimized per test case regardless."
    231       },
    232       "benchmark_contamination_addressed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Same reasoning as the other contamination questions. The schema says NA applies when 'the paper does not evaluate a pre-trained model on any benchmark (same NA rule as training_cutoff_stated).' This paper tests adaptive attacks against defenses, not model knowledge. The relevant evaluation is whether defense mechanisms can withstand adversarial attacks, which is independent of whether the models have seen the specific benchmark examples. Per-example adversarial optimization makes memorization irrelevant to attack effectiveness."
    236       }
    237     },
    238     "human_studies": {
    239       "pre_registered": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study; it is a purely automated technical evaluation of attack and defense systems."
    243       },
    244       "irb_or_ethics_approval": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study. Section 9 discusses ethical considerations regarding dual-use research but no IRB approval is required or mentioned."
    248       },
    249       "demographics_reported": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "inclusion_exclusion_criteria": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "randomization_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "blinding_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "attrition_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       }
    274     },
    275     "cost_and_practicality": {
    276       "inference_cost_reported": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "The paper states that each adversarial string is trained 'on a single NVIDIA A100 GPU for approximately 30 minutes' (Appendix A), but API costs for gpt-4-0613 calls in data stealing evaluation and total inference cost per evaluation example are not reported."
    280       },
    281       "compute_budget_stated": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Appendix A states that 'We train each adversarial string on a single NVIDIA A100 GPU for approximately 30 minutes.' The training steps per defense are given in Table 5, providing reasonable compute budget information."
    285       }
    286     }
    287   },
    288   "claims": [
    289     {
    290       "claim": "Adaptive attacks achieve over 50% attack success rate against all eight evaluated defenses for both agent types.",
    291       "evidence": "Figure 2 shows ASR-adaptive attack values for all defense/agent combinations. Table 4 reports ASRs with standard deviations: e.g., direct harm Vicuna-7B achieves 0.90 ± 0.06, Llama3-8B total achieves 0.57 ± 0.17.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Defense-specific adaptive attacks significantly outperform non-adaptive attacks and attacks trained for other defenses.",
    296       "evidence": "Figure 5 (cross-evaluation) shows that for each defense, the attack specifically trained against it achieves the highest ASR in most cases. Table 3 compares non-adaptive vs. adaptive adversarial strings for perplexity filtering (24% vs. 76%) and paraphrasing (53% vs. 79%).",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Fine-tuned agents (Llama3-8B) are more resilient to IPI attacks than prompted agents (Vicuna-7B) under default conditions.",
    301       "evidence": "Figure 2 shows original ASR of 9% for Llama3-8B vs. 56% for Vicuna-7B (no defense). This is attributed to the finetuned agent's 'predefined conversational structure and clear separation between tool responses and context.'",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Adaptive attacks effectively bypass detection-based defenses by driving detection rates to nearly zero.",
    306       "evidence": "Table 2 shows detection rates drop from 61% (fine-tuned detector) and 34-72% (LLM-based detector) to 0-10% after applying M-GCG adaptive attacks. The stealth objective in multi-objective training 'is well fulfilled.'",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "Adaptive attacks are less effective in long-context settings (AgentDojo benchmark) than short-context settings (InjecAgent).",
    311       "evidence": "Appendix B Table 6 shows adaptive attacks on AgentDojo do not exceed 50% ASR for all defenses (e.g., Multi-objective GCG achieves only 31.25%), contrasted with InjecAgent where all exceed 50%. Average token length is 3,823 for AgentDojo vs. 1,033 for InjecAgent.",
    312       "supported": "moderate"
    313     }
    314   ],
    315   "methodology_tags": [
    316     "benchmark-eval"
    317   ],
    318   "key_findings": "The paper demonstrates that eight different defenses against indirect prompt injection (IPI) attacks on LLM agents can all be bypassed using adaptive attacks, consistently achieving over 50% attack success rates. Defense-specific adaptive attacks significantly outperform non-adaptive attacks, confirming the necessity of adaptive attack evaluation when assessing defense robustness. Fine-tuned agents show greater baseline resilience than prompted agents, but adaptive attacks remain highly effective against both. Adaptive attacks are less effective in long-context scenarios, indicating context length as a factor in defense robustness.",
    319   "red_flags": [
    320     {
    321       "flag": "Small evaluation subset",
    322       "detail": "Only 100 test cases are used from the 1,054-case InjecAgent benchmark (roughly 9.5%), with 50 randomly sampled from each of two attack types. No justification or power analysis is provided for why 100 cases is sufficient to support broad conclusions about all defenses being bypassable."
    323     },
    324     {
    325       "flag": "White-box attack assumption not representative",
    326       "detail": "All adaptive attacks assume the attacker has full white-box access to the agent, defense models, and detailed prompts. The limitations section acknowledges this, but the paper's conclusions are presented broadly without adequately foregrounding that real-world attackers typically have black-box or grey-box access at best."
    327     },
    328     {
    329       "flag": "No contamination analysis",
    330       "detail": "The InjecAgent benchmark was published in 2024, and Llama3-8B-Instruct (also 2024) may have been trained on data containing it. No contamination analysis is provided, which could affect the baseline ASR measurements."
    331     },
    332     {
    333       "flag": "Overgeneralized conclusion from two models",
    334       "detail": "Conclusions about defense robustness are drawn from only two relatively small models (Vicuna-7B and Llama3-8B). Whether findings generalize to larger, more capable frontier models (GPT-4, Claude, Gemini) is not tested, yet the framing is broad."
    335     }
    336   ],
    337   "cited_papers": [
    338     {
    339       "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents",
    340       "authors": [
    341         "Qiusi Zhan",
    342         "Zhixiang Liang",
    343         "Zifan Ying",
    344         "Daniel Kang"
    345       ],
    346       "year": 2024,
    347       "relevance": "The primary benchmark used for evaluation in this paper, directly relevant as a benchmark for LLM agent security evaluation."
    348     },
    349     {
    350       "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    351       "authors": [
    352         "Andy Zou",
    353         "Zifan Wang",
    354         "J. Zico Kolter",
    355         "Matt Fredrikson"
    356       ],
    357       "year": 2023,
    358       "arxiv_id": "2307.15043",
    359       "relevance": "Introduces the GCG attack algorithm that this paper adapts for IPI attacks against LLM agent defenses."
    360     },
    361     {
    362       "title": "Baseline Defenses for Adversarial Attacks Against Aligned Language Models",
    363       "authors": [
    364         "Neel Jain",
    365         "Avi Schwarzschild",
    366         "Yuxin Wen",
    367         "Gowthami Somepalli",
    368         "John Kirchenbauer",
    369         "Ping-yeh Chiang",
    370         "Micah Goldblum",
    371         "Aniruddha Saha",
    372         "Jonas Geiping",
    373         "Tom Goldstein"
    374       ],
    375       "year": 2023,
    376       "arxiv_id": "2309.00614",
    377       "relevance": "Provides perplexity filtering and paraphrasing defenses that this paper adapts and evaluates for IPI attack defense."
    378     },
    379     {
    380       "title": "AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Large Language Models",
    381       "authors": [
    382         "Xiaogeng Liu",
    383         "Nan Xu",
    384         "Muhao Chen",
    385         "Chaowei Xiao"
    386       ],
    387       "year": 2024,
    388       "relevance": "Introduces AutoDAN attack method that this paper adapts for IPI adaptive attacks against perplexity filtering defenses."
    389     },
    390     {
    391       "title": "Obfuscated Gradients Give a False Sense of Security: Circumventing Defenses to Adversarial Examples",
    392       "authors": [
    393         "Anish Athalye",
    394         "Nicholas Carlini",
    395         "David A. Wagner"
    396       ],
    397       "year": 2018,
    398       "relevance": "Foundational work on adaptive attacks in adversarial ML that motivates this paper's evaluation methodology for IPI defense robustness."
    399     },
    400     {
    401       "title": "Benchmarking and Defending Against Indirect Prompt Injection Attacks on Large Language Models",
    402       "authors": [
    403         "Jingwei Yi",
    404         "Yueqi Xie",
    405         "Bin Zhu",
    406         "Keegan Hines",
    407         "Emre Kiciman",
    408         "Guangzhong Sun",
    409         "Xing Xie",
    410         "Fangzhao Wu"
    411       ],
    412       "year": 2023,
    413       "arxiv_id": "2312.14197",
    414       "relevance": "Prior work on benchmarking and defending IPI attacks that this paper builds upon."
    415     },
    416     {
    417       "title": "AgentDojo: A Dynamic Environment to Evaluate Attacks and Defenses for LLM Agents",
    418       "authors": [
    419         "Edoardo Debenedetti",
    420         "Jie Zhang",
    421         "Mislav Balunovic",
    422         "Luca Beurer-Kellner",
    423         "Marc Fischer",
    424         "Florian Tramer"
    425       ],
    426       "year": 2024,
    427       "arxiv_id": "2406.13352",
    428       "relevance": "Secondary benchmark used in this paper's Appendix B to validate adaptive attack effectiveness in a different agentic evaluation environment."
    429     },
    430     {
    431       "title": "HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal",
    432       "authors": [
    433         "Mantas Mazeika",
    434         "Long Phan",
    435         "Xuwang Yin",
    436         "Andy Zou"
    437       ],
    438       "year": 2024,
    439       "relevance": "Related evaluation framework for adversarial attacks on LLMs, relevant to the survey's interest in security evaluation methodology."
    440     },
    441     {
    442       "title": "Jatmo: Prompt Injection Defense by Task-Specific Finetuning",
    443       "authors": [
    444         "Julien Piet",
    445         "Maha Alrashed",
    446         "Chawin Sitawarin",
    447         "Sizhe Chen",
    448         "Zeming Wei",
    449         "Elizabeth Sun",
    450         "Basel Alomair",
    451         "David A. Wagner"
    452       ],
    453       "year": 2024,
    454       "relevance": "Provides the adversarial finetuning defense approach evaluated in this paper."
    455     },
    456     {
    457       "title": "On Adaptive Attacks to Adversarial Example Defenses",
    458       "authors": [
    459         "Florian Tramer",
    460         "Nicholas Carlini",
    461         "Wieland Brendel",
    462         "Aleksander Madry"
    463       ],
    464       "year": 2020,
    465       "relevance": "Seminal work on adaptive attack methodology that motivates the systematic evaluation approach in this paper."
    466     },
    467     {
    468       "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    469       "authors": [
    470         "Sahar Abdelnabi",
    471         "Kai Greshake",
    472         "Shailesh Mishra",
    473         "Christoph Endres",
    474         "Thorsten Holz",
    475         "Mario Fritz"
    476       ],
    477       "year": 2023,
    478       "relevance": "Early work on indirect prompt injection attacks against LLM-integrated applications, foundational to the attack vector studied in this paper."
    479     },
    480     {
    481       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    482       "authors": [
    483         "Shunyu Yao",
    484         "Jeffrey Zhao",
    485         "Dian Yu",
    486         "Nan Du",
    487         "Izhak Shafran",
    488         "Karthik R. Narasimhan",
    489         "Yuan Cao"
    490       ],
    491       "year": 2023,
    492       "relevance": "Introduces the ReAct agent framework used as the scaffolding for the prompted agent evaluated in this paper."
    493     }
    494   ]
    495 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs