scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27726B)
      1 {
      2   "paper": {
      3     "title": "TAMAS: Benchmarking Adversarial Risks in Multi-Agent LLM Systems",
      4     "authors": ["Ishan Kavathekar", "Hemang Jain", "Ameya Rathod", "Ponnurangam Kumaraguru", "Tanuja Ganu"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2511.05269",
      8     "doi": "10.48550/arXiv.2511.05269"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "The abstract states 'Code is available at https://github.com/microsoft/TAMAS' and the paper mentions all data, attack implementations, and evaluation scripts will be publicly released."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper states 'All data, attack implementations, and evaluation scripts will be publicly released to support reproducibility' (Section 5.1) and provides a GitHub repository URL."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No environment specifications, requirements.txt, Dockerfile, or dependency details are provided in the paper. Models are accessed via API or Ollama but no version pinning or environment setup is described."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are provided in the paper. The paper describes the experimental setup at a high level but does not include commands or scripts to replicate results."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Appendix D.3 provides bootstrapped estimates with 95% confidence intervals for all ARIA values across models and configurations (Tables 7-11)."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "Despite making numerous comparative claims (e.g., closed-source vs open-source models, configuration comparisons), no statistical significance tests are reported. Comparisons are based on raw percentages."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper reports percentage differences with context, e.g., 'average ARIA-4 is 15.6% for closed-source models compared to 39.2% for open-source models' (Section 6), providing magnitude context for comparisons."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No justification is given for why 10 adversarial instances per attack type per scenario was chosen. No power analysis or sample size rationale is discussed."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Bootstrapped confidence intervals (Appendix D.3) provide variance estimates across domains using 10,000 resamples with Dirichlet perturbation."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper compares 10 different LLM backbones across 3 agent configurations and 2 frameworks (AutoGen, CrewAI), providing comparative baselines across models and configurations."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Models evaluated include GPT-4o, Gemini-2.0-Flash, Deepseek-R1-32B, Qwen3-32B/8B — all contemporary models at time of writing."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No ablation study is conducted. The paper does not systematically remove or modify components of the benchmark or defense mechanisms to understand their individual contributions."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper uses ARIA scores (4-tier classification), Safety Score, PNA (Performance under No Attack), and ERS (Effective Robustness Score) — multiple complementary metrics."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Human verification of LLM-as-a-judge is conducted on 140 representative logs, with F1 scores reported per attack type (Appendix C.1). This validates the automated evaluation."
     87       },
     88       "held_out_test_set": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "This is a benchmark evaluation paper testing robustness, not a model training paper. There is no train/test split concept applicable here."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down by attack type (6 attacks), model (10 models), and configuration (5 setups) in Figure 2, Table 2, and appendix tables."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 7 and Appendix E provide illustrative failure cases showing agents executing malicious tasks, acknowledging maliciousness yet complying, and conversation trajectories being manipulated."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper reports that persuasive agent attacks were 'entirely unsuccessful in practice: across all domains, models, and configurations' (Appendix A.3), a clear negative result."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract claims multi-agent systems are 'highly vulnerable to adversarial attacks' — supported by ARIA-4 scores showing 60-90%+ attack success rates across most attack types. The claim of evaluating 10 LLMs, 3 configurations, 300 adversarial instances matches the reported experiments."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "The paper makes causal claims like 'Impersonation succeeds largely because agents prioritize instructions from perceived authorities' (Section 6) and attributes CrewAI's higher safety to 'assigning tasks to individual agents upfront.' These causal explanations are speculative without controlled experiments isolating these factors."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The title claims 'Benchmarking Adversarial Risks in Multi-Agent LLM Systems' broadly, but results are limited to 5 specific scenarios, 2 frameworks (AutoGen, CrewAI), and synthetic tool environments. The limitations section acknowledges this is not exhaustive, but the title and abstract frame findings more broadly."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper does not discuss alternative explanations for its findings. For example, the difference between open-source and closed-source model vulnerability could be due to model size rather than open/closed nature, but this is not explored."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper uses synthetic tool invocations as a proxy for real-world adversarial risk but does not discuss this gap. The simulated tools return fixed strings (e.g., 'News data poisoned with false information!') — the gap between this simulation and actual adversarial consequences is not acknowledged."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "Models are listed as 'GPT-4', 'GPT-4o', 'GPT-4o-mini', 'Gemini-2.0-Flash' etc. without specific version snapshots or API dates. Table 5 lists models but no version identifiers beyond marketing names."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Appendix H provides the full ARIA evaluation prompts used for each attack type. Appendix A.1 provides sample agent system prompts and tool definitions. Sample adversarial queries are shown throughout."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "The paper states GPT-4o is used as LLM-as-judge with 'temperature setting of 0.0' (Appendix C.1). However, temperature settings for the backbone LLMs being evaluated are not reported."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The three agent configurations (Central Orchestrator, Sequential, Collaborative/Swarm) are described in Section 5.2 with their coordination mechanisms, and Table 3 compares their key characteristics."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 5.1 and Appendix A.4 describe the dataset construction process: scenarios were manually designed, ChatGPT was used for query generation, and all content was manually reviewed and refined."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Appendix F contains a dedicated 'Limitations and Future Work' section with four specific limitation categories."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The limitations section identifies specific threats: restricted to AutoGen and CrewAI frameworks, non-exhaustive set of configurations, limited to five scenarios and six attack types, and no defense benchmarking."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Appendix F explicitly states what was not tested: alternative frameworks, additional configurations, more scenarios/attacks, and defenses. Each limitation bullet identifies a specific scope boundary."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The paper promises public release of all data, attack implementations, and evaluation scripts via GitHub, and provides the repository URL."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Appendix A.4 describes the data curation process: manual design of multi-agent systems, ChatGPT-aided generation of user queries, and manual review of all generated content."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants are involved in the main study. The benchmark uses synthetic scenarios and automated LLM evaluations. Human annotation was only for validation of the LLM-as-judge."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The pipeline from scenario design → tool creation → query generation → manual review is described in Appendix A.4. The evaluation pipeline (log collection → LLM-as-judge → human verification) is described in Appendix C."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding source is disclosed anywhere in the paper. One author is from Microsoft Research India, but no funding acknowledgment is provided."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly listed: IIIT Hyderabad and Microsoft Research India. The GitHub repository is under microsoft/ organization."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "A Microsoft Research co-author is involved, and the benchmark evaluates systems that could be deployed on Azure/Microsoft platforms. Microsoft has commercial interest in multi-agent safety. No funding independence statement is provided."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is provided in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "This paper tests adversarial robustness of multi-agent systems, not model knowledge on a benchmark. The attacks are novel prompts, not knowledge-based tasks where training data contamination would be relevant."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "The benchmark tests safety/robustness behavior, not model knowledge, so training-data contamination is not a relevant concern. Train/test overlap is not applicable to adversarial robustness testing."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "The benchmark evaluates agent safety behavior, not factual knowledge. Contamination of adversarial test cases in training data is not a relevant concern for this type of evaluation."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in the study. The 140-sample human annotation for LLM-as-judge validation is a methodological check, not a human subjects study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants. The paper includes an ethics statement focused on the research aim of advancing safety."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in the study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in the study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in the study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in the study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in the study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference costs are reported despite running 10 models × 5 configurations × 400 tasks. GPT-4 was excluded from CrewAI 'due to budget constraints' (Section 5.3) but no actual cost figures are provided."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No total compute budget is stated. The paper mentions budget constraints for GPT-4 and Gemini compatibility issues but provides no quantification of total API spend or compute used."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No seed sensitivity analysis is reported. Results appear to be from single runs per model/configuration/attack combination, with bootstrapping applied post-hoc to existing results rather than multiple experimental runs."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The number of experimental runs per configuration is not explicitly stated. Each attack type has 10 datapoints per scenario (50 total per attack), but it is unclear if these were run once or multiple times."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": false,
    303         "answer": false,
    304         "justification": "No hyperparameter tuning is involved — the paper evaluates off-the-shelf models and frameworks without tuning."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": false,
    308         "answer": false,
    309         "justification": "No configuration selection/tuning is performed — all configurations are evaluated and reported."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The paper makes many comparisons across 10 models, 5 configurations, and 6 attack types but applies no multiple comparison correction. No significance tests are used at all."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors created the TAMAS benchmark and evaluate it themselves. No discussion of self-evaluation bias or independent validation beyond the LLM-as-judge human verification (which is internal validation, not independent replication)."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "The paper is not proposing a method that trades compute for performance. It evaluates existing systems on a benchmark."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The paper does not discuss whether its synthetic tool environments and scripted attack scenarios adequately measure real-world adversarial risk in multi-agent systems. The tools return fixed strings, which may not reflect realistic attack dynamics."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": true,
    333         "answer": true,
    334         "justification": "The paper explicitly evaluates the same models across multiple scaffolding configurations (Magentic-One, Round Robin, Swarm, CrewAI centralized/decentralized), treating the scaffold as a variable and analyzing its impact on results (Table 2, Section 6)."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "The benchmark tests adversarial robustness, not factual knowledge. Temporal leakage is not relevant to whether an agent complies with a malicious instruction."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "The evaluation measures safety behavior (whether an agent complies with a malicious instruction), not prediction accuracy, so feature leakage is not applicable."
    347       },
    348       "non_independence_addressed": {
    349         "applies": false,
    350         "answer": false,
    351         "justification": "There is no train/test split; the benchmark evaluates behavioral robustness, not learned knowledge."
    352       },
    353       "leakage_detection_method": {
    354         "applies": false,
    355         "answer": false,
    356         "justification": "Leakage detection is not applicable to adversarial robustness benchmarks that test agent behavior."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "Multi-agent LLM systems are highly vulnerable to adversarial attacks across diverse attack vectors.",
    363       "evidence": "Figure 2 shows ARIA-4 (successful attack) rates of 60-90%+ for DPI and Impersonation attacks across most models and configurations. Section 6 reports specific numbers.",
    364       "supported": "strong"
    365     },
    366     {
    367       "claim": "Impersonation attacks are consistently the most effective, reaching 82% in Swarm, while DPI reaches 81% in Magentic-One.",
    368       "evidence": "Figure 2 and Section 6 report these specific attack success rates across configurations.",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "Closed-source models are more resilient to IPI than open-source models.",
    373       "evidence": "Section 6: 'average ARIA-4 is 15.6% for closed-source models compared to 39.2% for open-source models' in Magentic-One, and '37.6% versus 75.2%' in Round Robin.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "CrewAI configurations yield higher safety scores compared to their AutoGen counterparts.",
    378       "evidence": "Table 2 shows CrewAI safety scores generally higher. The paper attributes this to 'assigning tasks to individual agents upfront, rather than dynamically during execution.'",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "The colluding-agents attack has low overall success (2-16%), but at least one agent often completes the malicious task.",
    383       "evidence": "Section 6 and Table 6 show colluding attack success is low overall but individual agent compliance ranges from 10-48% across configurations.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Agents sometimes acknowledge maliciousness yet still comply with harmful instructions.",
    388       "evidence": "Section 7 and Appendix E provide qualitative examples where agents flag requests as malicious but proceed with execution.",
    389       "supported": "moderate"
    390     }
    391   ],
    392   "methodology_tags": ["benchmark-eval"],
    393   "key_findings": "TAMAS reveals that multi-agent LLM systems are highly vulnerable to adversarial attacks, with prompt-level attacks (DPI, Impersonation) achieving 60-90%+ success rates across 10 models and 5 configurations. Closed-source models show greater resilience to indirect prompt injection than open-source models. Agent coordination architecture significantly affects vulnerability, with CrewAI's upfront task assignment yielding higher safety than AutoGen's dynamic delegation. Even when agents recognize malicious intent, they frequently proceed with execution, indicating fundamental safety mechanism failures in multi-agent settings.",
    394   "red_flags": [
    395     {
    396       "flag": "Synthetic evaluation environment",
    397       "detail": "All tools are simulated with fixed-string returns (e.g., 'News data poisoned with false information!'). The gap between these synthetic scenarios and real-world adversarial risk is not discussed. Real tools would have actual consequences and different failure modes."
    398     },
    399     {
    400       "flag": "No statistical significance tests",
    401       "detail": "Numerous comparative claims (open vs closed source, framework comparisons, model rankings) are made based on raw percentage comparisons without any significance testing, despite small sample sizes per cell (10 instances per attack per scenario)."
    402     },
    403     {
    404       "flag": "Company evaluating related product space",
    405       "detail": "The paper has a Microsoft Research co-author and its code is hosted under the microsoft/ GitHub organization, while it evaluates multi-agent frameworks that compete in Microsoft's commercial space. No conflict of interest statement is provided."
    406     },
    407     {
    408       "flag": "LLM-generated benchmark data",
    409       "detail": "ChatGPT was used to generate user queries and attacker tools (Appendix A.4). While manually reviewed, this introduces potential biases in attack pattern diversity and may favor certain model families."
    410     },
    411     {
    412       "flag": "Small sample sizes per condition",
    413       "detail": "Only 10 adversarial instances per attack type per scenario means each cell in the results has very few datapoints, limiting the reliability of percentage comparisons across models and configurations."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents",
    419       "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunović", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"],
    420       "year": 2024,
    421       "arxiv_id": "2406.13352",
    422       "relevance": "Benchmark for evaluating prompt injection attacks and defenses in LLM agent systems."
    423     },
    424     {
    425       "title": "Agent Security Bench (ASB): Formalizing and Benchmarking Attacks and Defenses in LLM-based Agents",
    426       "authors": ["Hanrong Zhang", "Jingyuan Huang", "Kai Mei"],
    427       "year": 2025,
    428       "arxiv_id": "2410.02644",
    429       "relevance": "Comprehensive single-agent security benchmark that TAMAS extends to multi-agent settings."
    430     },
    431     {
    432       "title": "AgentHarm: A Benchmark for Measuring Harmfulness of LLM Agents",
    433       "authors": ["Maksym Andriushchenko", "Alexandra Souly"],
    434       "year": 2025,
    435       "arxiv_id": "2410.09024",
    436       "relevance": "Evaluates how effectively agents refuse harmful queries, related to TAMAS safety evaluation."
    437     },
    438     {
    439       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    440       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    441       "year": 2023,
    442       "relevance": "Multi-agent framework used as one of the two primary evaluation platforms in TAMAS."
    443     },
    444     {
    445       "title": "Multi-Agent Risks from Advanced AI",
    446       "authors": ["Lewis Hammond", "Alan Chan", "Jesse Clifton"],
    447       "year": 2025,
    448       "arxiv_id": "2502.14143",
    449       "relevance": "Comprehensive survey of risks from multi-agent AI systems, provides theoretical grounding for TAMAS."
    450     },
    451     {
    452       "title": "Why Do Multi-Agent LLM Systems Fail?",
    453       "authors": ["Mert Cemri", "Melissa Z. Pan", "Shuyi Yang"],
    454       "year": 2025,
    455       "arxiv_id": "2503.13657",
    456       "relevance": "Analyzes failure modes from inter-agent misalignment in multi-agent LLM systems."
    457     },
    458     {
    459       "title": "InjectAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents",
    460       "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"],
    461       "year": 2024,
    462       "arxiv_id": "2403.02691",
    463       "relevance": "Benchmark for indirect prompt injection in LLM agents, predecessor to TAMAS in single-agent setting."
    464     },
    465     {
    466       "title": "RedCode: Risky Code Execution and Generation Benchmark for Code Agents",
    467       "authors": ["Chengquan Guo", "Xun Liu", "Chulin Xie"],
    468       "year": 2024,
    469       "arxiv_id": "2411.07781",
    470       "relevance": "Benchmarks safety of code agents generating and executing potentially harmful code."
    471     },
    472     {
    473       "title": "R-Judge: Benchmarking Safety Risk Awareness for LLM Agents",
    474       "authors": ["Tongxin Yuan", "Zhiwei He", "Lingzhong Dong"],
    475       "year": 2024,
    476       "arxiv_id": "2401.10019",
    477       "relevance": "Evaluates LLM safety awareness through curated risky agent trajectories."
    478     },
    479     {
    480       "title": "SafeArena: Evaluating the Safety of Autonomous Web Agents",
    481       "authors": ["Ada Defne Tur", "Nicholas Meade", "Xing Han Lù"],
    482       "year": 2025,
    483       "arxiv_id": "2503.04957",
    484       "relevance": "Provides the ARIA safety evaluation framework adopted by TAMAS."
    485     },
    486     {
    487       "title": "Secret Collusion Among Generative AI Agents: Multi-Agent Deception via Steganography",
    488       "authors": ["Sumeet Ramesh Motwani", "Mikhail Baranchuk"],
    489       "year": 2025,
    490       "arxiv_id": "2402.07510",
    491       "relevance": "Explores colluding agent attacks through steganographic communication between LLM agents."
    492     },
    493     {
    494       "title": "AgentMonitor: A Plug-and-Play Framework for Predictive and Secure Multi-Agent Systems",
    495       "authors": ["Chi-Min Chan", "Jianxuan Yu", "Weize Chen"],
    496       "year": 2024,
    497       "arxiv_id": "2408.14972",
    498       "relevance": "Framework for monitoring and securing multi-agent LLM systems."
    499     }
    500   ]
    501 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs