ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25286B)


      1 {
      2   "paper": {
      3     "title": "Malice in Agentland: Down the Rabbit Hole of Backdoors in the AI Supply Chain",
      4     "authors": ["Léo Boisvert", "Abhay Puri", "Chandra Kiran Reddy Evuru", "Nicolas Chapados", "Quentin Cappart", "Alexandre Lacoste", "Krishnamurthy (DJ) Dvijotham", "Alexandre Drouin"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2510.05159",
      8     "doi": "10.48550/arXiv.2510.05159"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Adversaries can embed trigger-based backdoors in AI agents by poisoning as few as 2% of fine-tuning traces, achieving over 80% attack success rate across three threat models (direct data poisoning, environmental poisoning, backdoored base model). Backdoors persist through extensive clean fine-tuning and are masked by improved task performance. Prominent defenses including Llama-Firewall, Granite Guardian, and Watch the Weights fail to reliably detect or prevent the attacks, with LLM-as-a-judge defenses showing inconsistent effectiveness across benchmarks.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL or code archive is provided in the paper. No mention of code release."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The poisoned datasets and fine-tuning traces are not released. The paper uses public benchmarks (WebArena, τ-bench) but does not release its modified/poisoned versions."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Section C mentions '8×A100 80GB GPUs' and frameworks (LLAMA-FACTORY, DeepSpeed ZeRO-2), but no requirements.txt, Dockerfile, or specific library versions are provided."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The experimental setup is described in prose but there are no scripts or commands to replicate."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Tables 2 and 3, and the results tables throughout, report ± standard deviation for TSR and ASR metrics (e.g., '39.13 ± 2.63')."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims attacks are 'highly effective' and defenses 'fail' but no statistical significance tests are performed to compare conditions."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Results provide baseline context (e.g., TSR from 22.61 baseline to 39.13 after SFT, ASR of 100%) enabling assessment of effect magnitude."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for the number of test tasks (115 for τ-bench retail, 165 for WebArena-Lite) or the number of trials (2-3)."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Standard deviations are reported across multiple trials (3 for τ-bench, 2 for WebArena) in all results tables."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Zero-shot baselines (no fine-tuning) and clean fine-tuned baselines (0% poison) are included across all experiments."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Defenses tested include contemporary models: Llama-Firewall (2025), Granite Guardian 3.3-8B (2025), Watch the Weights (2025), and GPT-5 as judge."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The poison rate is varied systematically (Fig. 3 shows ASR/TSR over ρ), and TM3 varies the number of clean fine-tuning steps. These serve as ablations over the key attack parameters."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Two complementary metrics are used: Task Success Rate (TSR) for capability retention and Attack Success Rate (ASR) for backdoor effectiveness."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation of attack stealthiness or agent behavior is conducted. All evaluation is automated."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "τ-bench uses separate test domain (115 retail test tasks, 25 airline test tasks). WebArena-Lite (165 tasks) is used for evaluation. Training and test sets are clearly separated."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by threat model (TM1/TM2/TM3), benchmark (τ-bench/WebArena), model (Qwen/Llama), poison rate, and defense type."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 5.3 discusses Watch the Weights' failure due to domain mismatch (97-100% FPR). Section 5.4 and Table 4 discuss LLM-as-judge failures. Appendix F discusses ASR decline at high poison rates for Qwen in WebArena."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Defenses failing is a central finding. Table 5 shows ASR decline at 50% poison for Qwen-7B in WebArena. GPT-5 mini achieves 0% TPR on τ-bench (Table 4)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims ('over 80% success when a specific trigger is present', 'poisoning as few as 2% of the collected traces', safeguards 'fail to detect or prevent') are supported by Tables 2-3 and Figs. 3-4."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper makes causal claims about poisoning causing backdoors. The experimental design manipulates poison rate as the independent variable while controlling other factors, providing adequate causal evidence through controlled ablation."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title and abstract frame this as a general AI supply chain vulnerability, but experiments are limited to 3-8B parameter models on two benchmarks. The paper does not bound claims to these specific model sizes or settings."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No substantive discussion of alternative explanations. For example, whether the effectiveness is specific to the model architectures tested, whether larger models would be more resistant, or whether the TSR improvement could be due to factors other than the attack's stealthiness."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper clearly defines and measures ASR (attack success rate) and TSR (task success rate) as its metrics and uses them precisely to support claims about attack effectiveness and stealthiness. No proxy gap exists."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Exact model names with sizes are specified: Qwen-2.5-3B-Instruct, Qwen-2.5-7B-Instruct, Llama-3.1-8B-Instruct, Qwen-2.5-72B-Instruct (teacher), GPT-4o (simulated user), GPT-5/5-mini/5-nano (judge)."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full prompt text is provided in Appendix B (agent system prompts), Appendix D (LLM-as-judge prompts), Appendix E (trigger strings and prompt injection strings for all threat models)."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix C provides detailed hyperparameters: learning rate (1e-5, 1e-6), batch size (32 effective), epochs (2-5), max sequence length (16384/20000), gradient norm (1.0), warmup ratio, LoRA rank (8)."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section B.1 describes the web agent design including action set (click, type, hover, scroll, call_api, etc.), the NNetNavBrowserGymAgent, and the DoomArena framework for trigger injection."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 5.1 describes trace collection (10 independent trials, retaining only reward=1 trajectories, yielding 4000 samples split 90/10). Poisoning procedure is described in detail for each benchmark."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations section. The Discussion (Section 6) mentions future work directions but does not substantively discuss the study's own limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No specific threats to validity are discussed. The paper does not address limitations of the experimental setup such as restricted model sizes, benchmark-specific results, or the controlled lab setting vs real supply chains."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what the results do NOT show. It frames findings as general supply chain vulnerabilities without bounding to the tested model sizes, benchmarks, or attack configurations."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (traces, model outputs, per-task results) is made available for verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 5.1 describes how training traces were collected using Qwen-2.5-72B-Instruct as teacher with GPT-4o as simulated user, 10 independent trials, retaining only successful trajectories."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. All experiments use automated benchmarks and simulated environments."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from trace collection → poisoning → fine-tuning → evaluation is described for each threat model. Appendix E provides the exact trigger/injection content. Section C details the training pipeline."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding statement is present in the paper. Authors are from ServiceNow Research and academic institutions but no specific funding is disclosed."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: ServiceNow Research, Mila, Polytechnique Montréal, Université Laval."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "ServiceNow is a major AI agent platform provider (mentioned in the introduction). The paper highlights security vulnerabilities in AI agents, which could benefit ServiceNow's security positioning. No independence of funder is discussed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This paper tests backdoor attacks/defenses on agents, not model knowledge on benchmarks. The evaluation measures attack success, not model capability."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Same rationale — the paper evaluates security properties of fine-tuned agents, not pre-trained model capability on benchmarks."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Same rationale — contamination in the benchmark-knowledge sense is not relevant to this security study."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost, API costs, or per-example cost is reported despite using multiple LLM calls for data collection and evaluation."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Section C states '8×A100 80GB GPUs for 5-6 hours' for τ-bench and '15-20 hours per run' for WebArena fine-tuning."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Multiple trials are run (3 for τ-bench, 2 for WebArena) with standard deviations reported. This addresses seed/trial sensitivity."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section 4 explicitly states: 'three trials for each τ-Bench experiment and two for each WebArena experiment, reflecting the higher computational cost of the latter.'"
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported. The paper presents specific hyperparameter choices without discussing how they were selected."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Section C states 'Evaluation was conducted at the end of each epoch using validation loss as the selection criterion.' Appendix F discusses the TSR-based checkpoint selection for Qwen in WebArena."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No statistical tests are performed at all, let alone corrections for multiple comparisons across the many conditions tested."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement their own attacks and evaluate their own systems without acknowledging potential author-evaluation bias. The defense implementations are their own."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "No analysis of performance as a function of compute. The paper does not discuss whether more compute for defenses or clean fine-tuning would improve detection."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether WebArena and τ-bench adequately represent real-world agent deployment scenarios. The paper assumes benchmark results transfer to real supply chain threats without discussing construct validity."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "The same agent scaffold (NNetNavBrowserGymAgent for WebArena, same tool-calling setup for τ-bench) is used consistently across poisoned and clean conditions, controlling for scaffold confound."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "This paper tests security properties (backdoor attacks), not model knowledge. Temporal leakage is not relevant to whether a backdoor trigger activates."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "Same rationale — the paper evaluates attack effectiveness, not predictive accuracy on a benchmark where feature leakage would be relevant."
    349       },
    350       "non_independence_addressed": {
    351         "applies": false,
    352         "answer": false,
    353         "justification": "Same rationale — train/test independence in the benchmark-knowledge sense is not relevant to this security evaluation."
    354       },
    355       "leakage_detection_method": {
    356         "applies": false,
    357         "answer": false,
    358         "justification": "Same rationale — data leakage detection is not applicable to backdoor attack evaluation."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Poisoning as few as 2% of fine-tuning traces can embed a backdoor causing over 80% attack success rate.",
    365       "evidence": "Fig. 3 shows ASR rising steeply at low poison rates. Table 3 shows 2.3% poison rate achieving 91.65% ASR on WebArena and 5% achieving 100% on τ-bench.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Backdoored models maintain or improve task success rate compared to clean baselines, masking the attack.",
    370       "evidence": "Tables 2-3 show TSR of backdoored models (39.13, 43.77 on τ-bench; 14.37, 16.27 on WebArena) substantially exceeding zero-shot baselines (22.61, 0.6). Fig. 3 confirms TSR improvement alongside rising ASR.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Backdoors persist through extensive clean fine-tuning (TM3).",
    375       "evidence": "Fig. 4 shows ASR remains above 90% on τ-bench and near 100% on WebArena after fine-tuning on thousands of clean samples. Table 10 shows 100% ASR after LoRA fine-tuning on 500-1000 clean samples.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Prominent safeguards (Llama-Firewall, Granite Guardian, Watch the Weights) fail to detect or prevent the attacks.",
    380       "evidence": "Tables 2-3 show guardrails do not reduce ASR meaningfully. Section 5.3 reports Watch the Weights has 97-100% FPR. Tables 11-12 show data screening defenses have near-zero true positive rates.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "LLM-as-a-judge defense is effective in WebArena but unreliable in τ-bench.",
    385       "evidence": "Table 4 shows GPT-5 mini achieves 100% TPR / 0% FPR on WebArena but 0% TPR / 89.19% FPR on τ-bench.",
    386       "supported": "strong"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Company evaluating adjacent product space",
    392       "detail": "ServiceNow is a major AI agent platform (mentioned in the introduction). The paper demonstrates vulnerabilities in AI agent supply chains, which could benefit ServiceNow's competitive positioning on security. No conflict of interest statement is provided."
    393     },
    394     {
    395       "flag": "No limitations section",
    396       "detail": "The paper lacks any dedicated limitations discussion. It does not address whether findings generalize beyond 3-8B parameter models, the two benchmarks tested, or the specific attack configurations studied."
    397     },
    398     {
    399       "flag": "Generalization beyond evidence",
    400       "detail": "The paper frames results as demonstrating a general 'AI supply chain' vulnerability but experiments use only small open-weight models (3-8B) on two benchmarks. Whether these attacks work on larger models, closed-source APIs, or real enterprise deployments is left unaddressed."
    401     }
    402   ],
    403   "cited_papers": [
    404     {
    405       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    406       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    407       "year": 2024,
    408       "relevance": "Foundational work on persistent backdoors in LLMs that survive safety training, directly motivating TM3."
    409     },
    410     {
    411       "title": "BadAgent: Inserting and activating backdoor attacks in LLM agents",
    412       "authors": ["Yifei Wang", "Dizhan Xue", "Shengjie Zhang", "Shengsheng Qian"],
    413       "year": 2024,
    414       "arxiv_id": "2406.03007",
    415       "relevance": "Prior work on backdoor attacks specifically targeting LLM-based agents."
    416     },
    417     {
    418       "title": "DoomArena: A framework for testing AI agents against evolving security threats",
    419       "authors": ["Léo Boisvert", "Mihir Bansal", "Chandra Kiran Reddy Evuru"],
    420       "year": 2025,
    421       "arxiv_id": "2504.14064",
    422       "relevance": "Framework used in this paper for injecting triggers in WebArena; agent security testing framework."
    423     },
    424     {
    425       "title": "τ-bench: A benchmark for tool-agent-user interaction in real-world domains",
    426       "authors": ["Shunyu Yao", "Noah Shinn", "Pedram Razavi", "Karthik Narasimhan"],
    427       "year": 2024,
    428       "arxiv_id": "2406.12045",
    429       "relevance": "Primary benchmark for tool-calling agent evaluation used in this study."
    430     },
    431     {
    432       "title": "WebArena: A realistic web environment for building autonomous agents",
    433       "authors": ["Shuyan Zhou", "Frank F Xu"],
    434       "year": 2023,
    435       "arxiv_id": "2307.13854",
    436       "relevance": "Primary web agent benchmark used for evaluating attacks."
    437     },
    438     {
    439       "title": "LlamaFirewall: An open source guardrail system for building secure AI agents",
    440       "authors": ["Sahana Chennabasappa"],
    441       "year": 2025,
    442       "arxiv_id": "2505.03574",
    443       "relevance": "State-of-the-art guardrail system evaluated as a defense in this paper."
    444     },
    445     {
    446       "title": "Poisoning web-scale training datasets is practical",
    447       "authors": ["Nicholas Carlini", "Matthew Jagielski"],
    448       "year": 2024,
    449       "relevance": "Demonstrated practicality of web-scale data poisoning attacks, foundational for TM1/TM2."
    450     },
    451     {
    452       "title": "NNetNav: Unsupervised learning of browser agents through environment interaction in the wild",
    453       "authors": ["Shikhar Murty", "Hao Zhu", "Dzmitry Bahdanau", "Christopher D Manning"],
    454       "year": 2025,
    455       "arxiv_id": "2410.02907",
    456       "relevance": "Source of WebArena fine-tuning data and methodology for unsupervised trace collection that this paper shows is vulnerable to poisoning."
    457     },
    458     {
    459       "title": "Watch the weights: Unsupervised monitoring and control of fine-tuned LLMs",
    460       "authors": ["Ziqian Zhong", "Aditi Raghunathan"],
    461       "year": 2025,
    462       "arxiv_id": "2508.00161",
    463       "relevance": "Weight-based backdoor defense evaluated in TM3; shown to fail due to domain mismatch."
    464     },
    465     {
    466       "title": "EIA: Environmental injection attack on generalist web agents for privacy leakage",
    467       "authors": ["Zeyi Liao", "Lingbo Mo"],
    468       "year": 2024,
    469       "arxiv_id": "2409.11295",
    470       "relevance": "Prior work on environmental injection attacks against web agents, related to TM2."
    471     },
    472     {
    473       "title": "AgentPoison: Red-teaming LLM agents via poisoning memory or knowledge bases",
    474       "authors": ["Zhaorun Chen", "Zhen Xiang"],
    475       "year": 2024,
    476       "relevance": "Agent poisoning through memory/knowledge bases; related attack vector for LLM agents."
    477     },
    478     {
    479       "title": "Adversarial machine learning: A taxonomy and terminology of attacks and mitigations",
    480       "authors": ["Apostol Vassilev", "Alina Oprea"],
    481       "year": 2025,
    482       "relevance": "NIST taxonomy of adversarial ML attacks providing context for supply chain threats."
    483     }
    484   ]
    485 }

Impressum · Datenschutz