ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (26874B)


      1 {
      2   "paper": {
      3     "title": "Measuring Agents in Production",
      4     "authors": [
      5       "Melissa Z. Pan",
      6       "Negar Arabzadeh",
      7       "Riccardo Cogo",
      8       "Yuxuan Zhu",
      9       "Alexander Xiong",
     10       "Lakshya A Agrawal",
     11       "Huanzhi Mao",
     12       "Emma Shen",
     13       "Sid Pallerla",
     14       "Liana Patel",
     15       "Shu Liu",
     16       "Tianneng Shi",
     17       "Xiaoyuan Liu",
     18       "Jared Quincy Davis",
     19       "Emmanuele Lacavalla",
     20       "Alessandro Basile",
     21       "Shuyi Yang",
     22       "Paul Castro",
     23       "Daniel Kang",
     24       "Koushik Sen",
     25       "Dawn Song",
     26       "Joseph E. Gonzalez",
     27       "Ion Stoica",
     28       "Matei Zaharia",
     29       "Marquita Ellis"
     30     ],
     31     "year": 2025,
     32     "venue": "arXiv",
     33     "arxiv_id": "2512.04123",
     34     "doi": "10.48550/arXiv.2512.04123"
     35   },
     36   "scan_version": 2,
     37   "active_modules": [],
     38   "checklist": {
     39     "artifacts": {
     40       "code_released": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No repository URL or code release is mentioned in the paper. The survey instrument and interview protocol are described but no code or data analysis scripts are released."
     44       },
     45       "data_released": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Survey responses and interview data are described as anonymized and aggregated but no raw or processed dataset is released. No download link is provided."
     49       },
     50       "environment_specified": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No computational environment details are provided. The paper uses Qualtrics for surveys and LOTUS for semantic aggregation but provides no environment specifications for reproducing the analysis."
     54       },
     55       "reproduction_instructions": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "No step-by-step reproduction instructions are provided. While the survey questions are listed in Appendix G and interview protocol in Appendix C.3, there are no instructions for replicating the analysis pipeline."
     59       }
     60     },
     61     "statistical_methodology": {
     62       "confidence_intervals_or_error_bars": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper reports 95% confidence intervals estimated from 1,000 bootstrap samples with replacement for categorical comparisons. Error bars are shown in figures (e.g., Figure 1)."
     66       },
     67       "significance_tests": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "The paper reports descriptive statistics with bootstrap confidence intervals but does not perform formal significance tests to compare groups or test hypotheses. Claims like '80% cite increased productivity' are presented as descriptive proportions without statistical testing."
     71       },
     72       "effect_sizes_reported": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "The paper reports proportions and percentages but no formal effect sizes (e.g., Cohen's d, odds ratios). Differences between groups are presented as raw percentage differences without effect size measures."
     76       },
     77       "sample_size_justified": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "The sample sizes (20 interviews, 306 survey responses, 86 deployed systems) are reported but never justified. No power analysis or sample size rationale is given. The snowball sampling approach is described but without discussion of whether the achieved N is sufficient for the claims made."
     81       },
     82       "variance_reported": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Bootstrap-based confidence intervals serve as a variance measure for the survey proportions. The paper notes variation across responses in multiple figures."
     86       }
     87     },
     88     "evaluation_design": {
     89       "baselines_included": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "The paper positions itself as 'the first systematic study' of production agents but does not systematically compare its findings against prior surveys (e.g., LangChain 2024 survey of 1,300 professionals). Prior work is discussed in Section 2 but no quantitative comparison is made."
     93       },
     94       "baselines_contemporary": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No baseline comparisons are included, so contemporaneity cannot be assessed."
     98       },
     99       "ablation_study": {
    100         "applies": false,
    101         "answer": false,
    102         "justification": "This is a survey/interview study, not a system with components to ablate."
    103       },
    104       "multiple_metrics": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper examines multiple dimensions: 17 design dimensions across 4 research questions including models, architectures, prompting, evaluation methods, applications, and operational constraints."
    108       },
    109       "human_evaluation": {
    110         "applies": false,
    111         "answer": false,
    112         "justification": "Human evaluation of system outputs is not relevant here — this paper characterizes practices through surveys and interviews, not by evaluating a system."
    113       },
    114       "held_out_test_set": {
    115         "applies": false,
    116         "answer": false,
    117         "justification": "Not applicable to a survey/interview study."
    118       },
    119       "per_category_breakdown": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Results are broken down by deployment stage (deployed vs. all data in Appendix D), by domain (Figure 2), by organization maturity (Figure 15), and by many other dimensions throughout the paper."
    123       },
    124       "failure_cases_discussed": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Section 7 discusses deployment challenges including reliability failures, evaluation difficulties, and security challenges. Case studies describe specific failure patterns (e.g., model upgrades breaking workflows)."
    128       },
    129       "negative_results_reported": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper reports several negative findings: 75% evaluate without benchmarks, 26% have no meaningful baselines, agent deployments often fail or underdeliver (Section 1), and security mechanisms remain an open challenge."
    133       }
    134     },
    135     "claims_and_evidence": {
    136       "abstract_claims_supported": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Abstract claims (68% execute ≤10 steps, 70% rely on prompting, 74% depend on human evaluation) are all supported with specific data in Sections 4-7 with exact N values and percentages."
    140       },
    141       "causal_claims_justified": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "The paper makes several causal-adjacent claims, e.g., 'practitioners achieve reliability through system-level design rather than model-level or algorithmic advances' (Section 1) and 'teams deliberately choose simple, controllable methods... because they offer reliable agent performance.' These causal explanations are based on qualitative interview data, not controlled study design. The observational data cannot establish that simplicity *causes* reliability."
    145       },
    146       "generalization_bounded": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Section 3.3 explicitly bounds the study: 'Geographically, case study teams are concentrated in the Americas, with a few in Europe. Participation bias affects both data sources.' The paper states it documents 'the leading edge of production agents practice' rather than claiming comprehensive global coverage."
    150       },
    151       "alternative_explanations_discussed": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Section 3.3 discusses participation bias, temporal bias, geographic concentration, and self-selection effects. The paper acknowledges that respondents 'likely skew toward our professional networks' and that multi-month data collection may introduce temporal bias."
    155       },
    156       "proxy_outcome_distinction": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "The paper's claims match the granularity of its measurements — it reports what practitioners said in surveys and interviews without inflating these to broader claims. When practitioners report 'productivity gains,' the paper attributes this to practitioner self-report, not independently measured productivity."
    160       }
    161     },
    162     "setup_transparency": {
    163       "model_versions_specified": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "The paper does not evaluate any AI model — it surveys practitioners. LOTUS is mentioned for semantic aggregation but is a tool used in analysis, not the subject of evaluation."
    167       },
    168       "prompts_provided": {
    169         "applies": false,
    170         "answer": false,
    171         "justification": "The paper does not use prompting as part of an experiment. LOTUS is used for domain normalization but this is an analysis tool, not the experimental method."
    172       },
    173       "hyperparameters_reported": {
    174         "applies": false,
    175         "answer": false,
    176         "justification": "No hyperparameters are relevant — this is a survey/interview study."
    177       },
    178       "scaffolding_described": {
    179         "applies": false,
    180         "answer": false,
    181         "justification": "No agentic scaffolding is used in this study."
    182       },
    183       "data_preprocessing_documented": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 3.2 describes survey design with 47 questions, piloting with external teams, dynamic branching in Qualtrics, and filtering to 86 deployed systems. Section B.1.2 details domain normalization with LOTUS and three independent annotators (Cohen's κ=0.636). The filtering pipeline from 306 responses to 86 deployed agents is documented."
    187       }
    188     },
    189     "limitations_and_scope": {
    190       "limitations_section_present": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "Section 3.3 'Study Limitations' provides a dedicated discussion of scope and sampling limitations."
    194       },
    195       "threats_to_validity_specific": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 3.3 identifies specific threats: geographic concentration in the Americas, company policy filtering for interview acceptance, survey respondent skew toward professional networks, and temporal bias from multi-month data collection. These are specific to this study, not boilerplate."
    199       },
    200       "scope_boundaries_stated": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section 3.3 states: 'we view MAP as an initiative towards documenting the leading edge of production agents practice for open research rather than a comprehensive coverage of all agents development globally.' Geographic, participation, and temporal boundaries are explicitly stated."
    204       }
    205     },
    206     "data_integrity": {
    207       "raw_data_available": {
    208         "applies": true,
    209         "answer": false,
    210         "justification": "No raw survey data or interview transcripts are released. Only aggregated statistics are reported."
    211       },
    212       "data_collection_described": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Section 3 describes data collection in detail: 30-90 minute semi-structured interviews with 2-5 researchers, 47-question survey in Qualtrics with dynamic branching, distribution channels (Berkeley RDI Summit, AI Alliance Meetup, MOOC, LinkedIn, Discord, X), collection period July 28 to October 29, 2025."
    216       },
    217       "recruitment_methods_described": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Section 3.1 describes snowball sampling starting from professional networks, iteratively broadening for diversity. Section 3.2 lists distribution channels. Section C.2.3 details participant recruitment through professional networks and agent-focused technical venues, with screening for active involvement."
    221       },
    222       "data_pipeline_documented": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "The pipeline from 306 survey responses to 86 deployed agents is documented (Section 3.2). Domain normalization with LOTUS and three annotators is described (Section B.1.2). Interview analysis using grounded theory open coding followed by focused coding is described (Section 3.1)."
    226       }
    227     },
    228     "conflicts_of_interest": {
    229       "funding_disclosed": {
    230         "applies": true,
    231         "answer": true,
    232         "justification": "Acknowledgements section lists gifts from Accenture, Amazon, AMD, Anyscale, Broadcom Inc., Google, IBM, Intel, Intesa Sanpaolo, Lambda, Mibura Inc, Samsung SDS, and SAP."
    233       },
    234       "affiliations_disclosed": {
    235         "applies": true,
    236         "answer": true,
    237         "justification": "Author affiliations are listed: UC Berkeley, Intesa Sanpaolo, UIUC, Stanford University, IBM Research. The Intesa Sanpaolo affiliation is notable since the company is both an author institution and a funder."
    238       },
    239       "funder_independent_of_outcome": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "Several funders (Amazon, Google, IBM, Intesa Sanpaolo) are companies that build and deploy AI agents. They have a financial interest in positive narratives about production agent adoption. Intesa Sanpaolo is both a funder and has co-authors on the paper. The study's findings about widespread agent adoption could benefit these companies commercially."
    243       },
    244       "financial_interests_declared": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No competing interests or financial interests statement is present in the paper. The Impact Statement mentions only data confidentiality, not author conflicts."
    248       }
    249     },
    250     "contamination": {
    251       "training_cutoff_stated": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "This is a survey/interview study, not evaluating a pre-trained model on a benchmark."
    255       },
    256       "train_test_overlap_discussed": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "Not applicable — no model evaluation on benchmarks."
    260       },
    261       "benchmark_contamination_addressed": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "Not applicable — no model evaluation on benchmarks."
    265       }
    266     },
    267     "human_studies": {
    268       "pre_registered": {
    269         "applies": true,
    270         "answer": false,
    271         "justification": "No mention of pre-registration (OSF, AsPredicted, or equivalent). The study collected data from human participants (306 survey respondents and 20 interview teams), so pre-registration would be expected for scientific rigor."
    272       },
    273       "irb_or_ethics_approval": {
    274         "applies": true,
    275         "answer": false,
    276         "justification": "No IRB or ethics board approval is mentioned. The Impact Statement says 'All participant data was collected under confidentiality agreements' but does not mention institutional ethics review."
    277       },
    278       "demographics_reported": {
    279         "applies": true,
    280         "answer": true,
    281         "justification": "Figure 14a reports participant roles (software/ML engineers 50.9%, technical executives 31.5%, etc.). Figure 14b reports deployment stages. Section C.2.1 describes organizational maturity levels and geographic distribution. However, individual demographics (gender, experience level, geography) of respondents are limited."
    282       },
    283       "inclusion_exclusion_criteria": {
    284         "applies": true,
    285         "answer": true,
    286         "justification": "Section 3.2 describes filtering: responses filtered to 86 data points in production or pilot phases. For interviews, Section 3.1 describes iterative sample expansion guided by 'application diversity, organizational maturity, and global reach.' Section C.2.3 states participants were 'screened to ensure active, hands-on involvement.'"
    287       },
    288       "randomization_described": {
    289         "applies": false,
    290         "answer": false,
    291         "justification": "This is an observational/cross-sectional survey study, not an experimental study with treatment conditions. Randomization is not applicable."
    292       },
    293       "blinding_described": {
    294         "applies": false,
    295         "answer": false,
    296         "justification": "This is a cross-sectional survey/interview study. Blinding is not applicable."
    297       },
    298       "attrition_reported": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "The paper reports 306 total valid responses filtered to 86 deployed systems. Individual question N values vary (e.g., N=66, N=69, N=53) showing different completion rates per question. The paper notes 'All questions were optional to respect participants' freedom to answer.'"
    302       }
    303     },
    304     "cost_and_practicality": {
    305       "inference_cost_reported": {
    306         "applies": false,
    307         "answer": false,
    308         "justification": "This is a survey/interview study, not proposing a computational method."
    309       },
    310       "compute_budget_stated": {
    311         "applies": false,
    312         "answer": false,
    313         "justification": "This is a survey/interview study with negligible computational requirements."
    314       }
    315     }
    316   },
    317   "claims": [
    318     {
    319       "claim": "80% of practitioners deploy agents primarily to increase productivity",
    320       "evidence": "Figure 1, N=66, multi-select survey question. 80.3% (53/66) selected 'Increasing Productivity' among deployed agents.",
    321       "supported": "strong"
    322     },
    323     {
    324       "claim": "70% of deployed agents rely on prompting off-the-shelf models instead of weight tuning",
    325       "evidence": "Section 5.2, Figure 4: 14 of 20 case studies (70%) use off-the-shelf models without SFT or RL.",
    326       "supported": "moderate"
    327     },
    328     {
    329       "claim": "74% of deployed agents depend primarily on human-in-the-loop evaluation",
    330       "evidence": "Figure 7, N=31 deployed systems, 74.2% (23/31) report human-in-the-loop evaluation.",
    331       "supported": "strong"
    332     },
    333     {
    334       "claim": "68% of deployed agents execute at most 10 steps before human intervention",
    335       "evidence": "Figure 6a, N=60 deployed systems: 28 execute 1-4 steps and 13 execute 5-10 steps, totaling 68%.",
    336       "supported": "strong"
    337     },
    338     {
    339       "claim": "Production agents favor custom implementations (85%) over frameworks",
    340       "evidence": "Section 5.5: 17/20 case studies build custom in-house implementations. However, survey data shows 61% use frameworks, creating tension with this finding.",
    341       "supported": "moderate"
    342     },
    343     {
    344       "claim": "Reliability is the top development challenge",
    345       "evidence": "Figure 11a, N=29: 37.9% rank Core Technical Performance (reliability, robustness, scalability) as top priority, exceeding all other categories.",
    346       "supported": "strong"
    347     },
    348     {
    349       "claim": "Practitioners achieve reliability through system-level design rather than algorithmic advances",
    350       "evidence": "Section 7.1 describes patterns: read-only agents, sandbox verification, internal deployment, wrapper APIs, role-based access. Interview data supports this but is qualitative and not independently verified.",
    351       "supported": "moderate"
    352     },
    353     {
    354       "claim": "75% of case study teams evaluate without benchmark sets",
    355       "evidence": "Section 6.1: 15 of 20 case studies (75%) evaluate without benchmarks, relying on A/B testing, user feedback, and production monitoring.",
    356       "supported": "strong"
    357     }
    358   ],
    359   "methodology_tags": [
    360     "qualitative",
    361     "observational"
    362   ],
    363   "key_findings": "MAP is the first large-scale study of production AI agents, combining 20 in-depth case studies and 306 practitioner surveys across 26 domains. Production agents favor simplicity and control: 70% use off-the-shelf models without fine-tuning, 68% execute ≤10 steps before human intervention, and 74% rely on human-in-the-loop evaluation. Reliability remains the top development challenge, addressed through system-level design (read-only modes, sandboxing, constrained autonomy) rather than algorithmic advances. The study reveals a significant gap between research methods (emphasizing autonomy and RL) and production practices (favoring controllability and prompting).",
    364   "red_flags": [
    365     {
    366       "flag": "Recruitment bias",
    367       "detail": "Snowball sampling from UC Berkeley professional networks and specific events (Berkeley RDI Summit, AI Alliance Meetup) likely overrepresents Silicon Valley perspectives and organizations connected to the research group. The paper acknowledges this but the geographic and network concentration is substantial."
    368     },
    369     {
    370       "flag": "Non-independent funders",
    371       "detail": "Multiple funders (Amazon, Google, IBM, Intesa Sanpaolo) are major AI agent deployers with commercial interest in positive findings about agent adoption. Intesa Sanpaolo has both co-authors and funder status. No conflicts of interest statement is provided."
    372     },
    373     {
    374       "flag": "Small effective N for key claims",
    375       "detail": "Despite 306 total survey responses, after filtering to deployed agents many key findings rest on small N: N=31 for evaluation methods, N=29 for challenges, N=22 for number of models. Case study claims rest on N=20. Some percentages (e.g., 85% custom implementations) come from very small samples presented as strong findings."
    376     },
    377     {
    378       "flag": "No IRB/ethics review mentioned",
    379       "detail": "The study collected data from human participants through interviews and surveys without mentioning IRB or ethics board approval. This is a notable omission for a study published from a major research university."
    380     },
    381     {
    382       "flag": "Survivorship bias",
    383       "detail": "The study focuses on 'successful' production deployments by filtering to production/pilot systems. Failed or abandoned agent projects are largely excluded, potentially overrepresenting practices that appear successful due to selection."
    384     },
    385     {
    386       "flag": "Tension between interview and survey data",
    387       "detail": "Survey data shows 61% use frameworks, but case studies show 85% use custom implementations. The paper favors the case study finding in its narrative (Finding 10) without fully reconciling this contradiction."
    388     }
    389   ],
    390   "cited_papers": [
    391     {
    392       "title": "A survey on large language model based autonomous agents",
    393       "authors": ["Lei Wang", "Chen Ma", "Xueyang Feng"],
    394       "year": 2024,
    395       "relevance": "Foundational survey on LLM-based agent architectures and taxonomies, directly relevant to understanding the research landscape this paper characterizes."
    396     },
    397     {
    398       "title": "Survey on evaluation of LLM-based agents",
    399       "authors": ["Asaf Yehudai", "Lilach Eden"],
    400       "year": 2025,
    401       "arxiv_id": "2503.16416",
    402       "relevance": "Specialized survey on agent evaluation methods, complementary to MAP's empirical findings on evaluation practices in production."
    403     },
    404     {
    405       "title": "The emerged security and privacy of LLM agent: A survey with case studies",
    406       "authors": ["Feng He", "Tianqing Zhu"],
    407       "year": 2025,
    408       "doi": "10.1145/3773080",
    409       "relevance": "Survey on LLM agent security, relevant to MAP's finding that security is a secondary concern in current deployments."
    410     },
    411     {
    412       "title": "An illusion of progress? Assessing the current state of web agents",
    413       "authors": ["Tiange Xue"],
    414       "year": 2025,
    415       "arxiv_id": "2504.01382",
    416       "relevance": "Critical assessment of web agent capabilities that documents agent deployment failures, directly relevant to the production-research gap identified by MAP."
    417     },
    418     {
    419       "title": "Why johnny can't use agents: Industry aspirations vs. user realities with AI agent software",
    420       "authors": ["Pranav Shome"],
    421       "year": 2025,
    422       "arxiv_id": "2509.14528",
    423       "relevance": "Studies capability gaps in agent software through user studies and marketing analysis, complementary industry perspective."
    424     },
    425     {
    426       "title": "DSPy: Compiling declarative language model calls into state-of-the-art pipelines",
    427       "authors": ["Omar Khattab"],
    428       "year": 2024,
    429       "relevance": "Prompt optimization framework mentioned as one of the rare automated prompting approaches used in production (9%)."
    430     },
    431     {
    432       "title": "ITBench: Evaluating AI agents across diverse real-world IT automation tasks",
    433       "authors": ["Saurabh Jha"],
    434       "year": 2025,
    435       "arxiv_id": "2502.05352",
    436       "relevance": "Benchmark for evaluating AI agents in IT automation, relevant to benchmark scarcity findings in production agent evaluation."
    437     },
    438     {
    439       "title": "The AI scientist: Towards fully automated open-ended scientific discovery",
    440       "authors": ["Cong Lu"],
    441       "year": 2024,
    442       "arxiv_id": "2408.06292",
    443       "relevance": "Agentic AI system for scientific discovery, represents research-side agent capabilities that contrast with production practices."
    444     },
    445     {
    446       "title": "Towards an AI co-scientist",
    447       "authors": ["Juraj Gottweis"],
    448       "year": 2025,
    449       "arxiv_id": "2502.18864",
    450       "relevance": "Single-system study of an AI agent for scientific discovery, exemplifies the case study literature MAP contrasts with."
    451     },
    452     {
    453       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    454       "authors": ["John Yang"],
    455       "year": 2024,
    456       "arxiv_id": "2405.15793",
    457       "relevance": "Prominent coding agent framework, relevant to software engineering agent deployment patterns documented in MAP."
    458     },
    459     {
    460       "title": "Advances and challenges in foundation agents: From brain-inspired intelligence to evolutionary, collaborative, and safe systems",
    461       "authors": ["Bo Liu"],
    462       "year": 2025,
    463       "arxiv_id": "2504.01990",
    464       "relevance": "Comprehensive survey of agent architectures providing research-side perspective that MAP's production data contrasts with."
    465     },
    466     {
    467       "title": "Safety at scale: A comprehensive survey of large model and agent safety",
    468       "authors": ["Xiangyu Ma"],
    469       "year": 2025,
    470       "arxiv_id": "2502.05206",
    471       "relevance": "Survey on AI agent safety, relevant to MAP's finding that security is currently addressed through system-level constraints rather than dedicated safety mechanisms."
    472     },
    473     {
    474       "title": "LlamaFirewall: An open source guardrail system for building secure AI agents",
    475       "authors": ["Sridhar Chennabasappa"],
    476       "year": 2025,
    477       "arxiv_id": "2505.03574",
    478       "relevance": "Open-source security tool for AI agents, relevant to production agent security practices documented in MAP."
    479     }
    480   ]
    481 }

Impressum · Datenschutz