scan-v5.json (19837B)
1 { 2 "scan_version": 5, 3 "paper_type": "survey", 4 "paper": { 5 "title": "The 2025 AI Agent Index: Documenting Technical and Safety Features of Deployed Agentic AI Systems", 6 "authors": [ 7 "Leon Staufer", 8 "Kevin Feng", 9 "Kevin Wei", 10 "Luke Bailey", 11 "Yawen Duan", 12 "Mick Yang", 13 "A. Pinar Ozisik", 14 "Stephen Casper", 15 "Noam Kolt" 16 ], 17 "year": 2026, 18 "venue": "arXiv", 19 "arxiv_id": "2602.17753", 20 "doi": null 21 }, 22 "checklist": { 23 "claims_and_evidence": { 24 "abstract_claims_supported": { 25 "applies": true, 26 "answer": true, 27 "justification": "All abstract claims are supported: the 30-agent index is delivered, transparency gaps are shown through Figure 3 (198/1350 fields 'None found'), and safety documentation absence is documented with 133/240 safety fields having no public information.", 28 "source": "haiku" 29 }, 30 "causal_claims_justified": { 31 "applies": false, 32 "answer": false, 33 "justification": "The paper is primarily descriptive documentation; it makes no causal claims about what causes safety gaps or transparency differences.", 34 "source": "haiku" 35 }, 36 "generalization_bounded": { 37 "applies": true, 38 "answer": true, 39 "justification": "The paper explicitly scopes claims to 30 publicly available general-purpose agents as of December 31, 2025, and Section 6.2 acknowledges this may not generalize to internal deployments or domain-specific agents.", 40 "source": "haiku" 41 }, 42 "alternative_explanations_discussed": { 43 "applies": true, 44 "answer": true, 45 "justification": "The paper explicitly notes that Chinese companies lacking documented safety frameworks 'may simply not be documented publicly,' and consistently distinguishes 'None found' from 'None' to acknowledge that absence of public evidence is not evidence of absence.", 46 "source": "haiku" 47 }, 48 "proxy_outcome_distinction": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper consistently distinguishes between publicly available 
documentation (what is measured) and actual safety practices (what is claimed to be opaque), using the 'None found' vs 'None' distinction throughout.", 52 "source": "haiku" 53 } 54 }, 55 "limitations_and_scope": { 56 "limitations_section_present": { 57 "applies": true, 58 "answer": true, 59 "justification": "Section 6.2 'Limitations and Outlook' is a dedicated section covering methodology, scope, generalizability, and language coverage.", 60 "source": "haiku" 61 }, 62 "threats_to_validity_specific": { 63 "applies": true, 64 "answer": true, 65 "justification": "Specific threats named include: inclusion criteria favoring significant agents (reducing generalizability), public-interest metrics favoring consumer over enterprise products, reliance solely on English and Chinese documentation, and exclusive use of publicly available information missing internal practices.", 66 "source": "haiku" 67 }, 68 "scope_boundaries_stated": { 69 "applies": true, 70 "answer": true, 71 "justification": "The paper explicitly excludes domain-specific agents, company-internal products, limited pre-releases, and agents requiring software engineering expertise to deploy; scope is bounded to 30 publicly available general-purpose agents as of December 31, 2025.", 72 "source": "haiku" 73 } 74 }, 75 "conflicts_of_interest": { 76 "funding_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "The Acknowledgments section states: 'This research was supported by the MATS Research program, which provided funding for L.S. and M.Y. 
through research stipends.'", 80 "source": "haiku" 81 }, 82 "affiliations_disclosed": { 83 "applies": true, 84 "answer": true, 85 "justification": "All author affiliations are disclosed on the first page: Cambridge, UW, Harvard Law, Stanford, Concordia AI, UPenn, MIT (×2), and Hebrew University.", 86 "source": "haiku" 87 }, 88 "funder_independent_of_outcome": { 89 "applies": true, 90 "answer": true, 91 "justification": "MATS Research Program is an academic research fellowship program unrelated to the commercial AI agents being evaluated; no conflict between funder identity and the transparency findings reported.", 92 "source": "haiku" 93 }, 94 "financial_interests_declared": { 95 "applies": true, 96 "answer": false, 97 "justification": "There is no general competing interests or financial interests declaration for all authors; the only COI statement ('no conflicts of interest related to Anthropic or Claude Code') appears only in the Claude Code case study annotation, not as a paper-wide declaration.", 98 "source": "haiku" 99 } 100 }, 101 "scope_and_framing": { 102 "key_terms_defined": { 103 "applies": true, 104 "answer": true, 105 "justification": "Section 2 discusses the definition of 'agent' extensively drawing on prior literature; Section 3.1 operationalizes agency via four criteria (autonomy, goal complexity, environmental interaction, generality); autonomy levels L1-L5 are defined via Feng et al.", 106 "source": "haiku" 107 }, 108 "intended_contribution_clear": { 109 "applies": true, 110 "answer": true, 111 "justification": "Three contributions are explicitly enumerated in Section 1: (1) the Agent Index of 30 systems across 45 fields, (2) ecosystem-wide trends, and (3) three case studies of distinct agent categories.", 112 "source": "haiku" 113 }, 114 "engagement_with_prior_work": { 115 "applies": true, 116 "answer": true, 117 "justification": "Section 2 engages substantively with the 2024 AI Agent Index (predecessor), the Princeton Holistic Agent 
Leaderboard, Foundation Model Transparency Index, and a range of documentation frameworks (datasheets, model cards, system cards, factsheets), situating this work relative to each.", 118 "source": "haiku" 119 } 120 } 121 }, 122 "type_checklist": { 123 "survey": { 124 "search_and_selection": { 125 "search_strategy_reproducible": { 126 "applies": true, 127 "answer": false, 128 "justification": "Initial candidate discovery used LLM-based queries (ChatGPT 5.2, Claude Sonnet 4.5, Gemini 2.5 with research mode); Section B.5 provides the prompts but LLM outputs are non-deterministic and a re-run would not produce the same 95-candidate list.", 129 "source": "haiku" 130 }, 131 "inclusion_exclusion_explicit": { 132 "applies": true, 133 "answer": true, 134 "justification": "Figure 2 and Section 3.1 specify explicit, operationalized inclusion criteria with quantitative thresholds: ≥10,000 searches or ≥20,000 GitHub stars, ≥$1B valuation, plus all-required agency and practicality sub-criteria.", 135 "source": "haiku" 136 }, 137 "prisma_or_structured_protocol": { 138 "applies": true, 139 "answer": false, 140 "justification": "No mention of PRISMA or any other established systematic review protocol; the paper follows a custom inclusion framework.", 141 "source": "haiku" 142 }, 143 "search_terms_provided": { 144 "applies": true, 145 "answer": false, 146 "justification": "Section C.1 provides LLM prompts used to generate search terms but does not enumerate the final list of actual search terms used per agent; terms were LLM-generated and not explicitly listed.", 147 "source": "haiku" 148 }, 149 "databases_listed": { 150 "applies": true, 151 "answer": true, 152 "justification": "Sources explicitly named: Ahrefs API (search volume), Google Scholar (paper counts), Yahoo Finance/Crunchbase/Epoch AI (market cap), GitHub (stars), and cross-references with 2024 AI Agent Index, Princeton Holistic Agent Leaderboard, and AIAgentList.com.", 153 "source": "haiku" 154 }, 155 
"screening_process_documented": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section 3.3 documents the pipeline: LLM queries surfaced 95 candidates → screened against inclusion criteria → ambiguous cases annotated in depth → final inclusion decisions → 30 agents included; key stage counts are provided.", 159 "source": "haiku" 160 }, 161 "review_scope_justified": { 162 "applies": true, 163 "answer": true, 164 "justification": "The paper explains the rationale for focusing on 'highly agentic systems with high-impact real-world applications' publicly available as of December 31, 2025, and explicitly justifies why domain-specific and internal agents are excluded.", 165 "source": "haiku" 166 } 167 }, 168 "synthesis_quality": { 169 "conflicting_findings_acknowledged": { 170 "applies": true, 171 "answer": true, 172 "justification": "The paper systematically acknowledges systematic differences across categories (frontier labs vs. enterprise platforms on safety evaluation, Chinese vs. 
US governance documentation patterns) and reports 37 inter-annotator discrepancies resolved through discussion.", 173 "source": "haiku" 174 }, 175 "quality_assessment_of_sources": { 176 "applies": true, 177 "answer": false, 178 "justification": "There is no formal quality rubric or risk-of-bias assessment for the agents or their source documentation; annotations distinguish 'None found' vs 'None' but do not score the reliability or rigor of what is documented.", 179 "source": "haiku" 180 }, 181 "publication_bias_discussed": { 182 "applies": true, 183 "answer": true, 184 "justification": "Section 6.2 explicitly acknowledges that significance criteria 'favor well-funded companies and established products, potentially disadvantaging emerging developers and regional innovations,' and that exclusive reliance on public information may miss internal safety practices — the documentation-index equivalent of publication bias.", 185 "source": "haiku" 186 }, 187 "quantitative_synthesis_present": { 188 "applies": true, 189 "answer": true, 190 "justification": "The paper provides systematic counts and percentages throughout: 20/30 support MCP, 15/30 reference safety frameworks, 133/240 safety fields missing, 23/30 fully closed source, 3/30 with third-party testing — constituting quantitative vote-counting synthesis.", 191 "source": "haiku" 192 }, 193 "recommendations_supported_by_evidence": { 194 "applies": true, 195 "answer": true, 196 "justification": "Recommendations such as structured reporting requirements and evaluation targeting deployed tools rather than base models are directly grounded in documented gaps (133/240 safety fields missing, only 4 agent-specific system cards).", 197 "source": "haiku" 198 } 199 } 200 } 201 }, 202 "claims": [ 203 { 204 "claim": "Most developers share little information about safety, evaluations, and societal impacts: 133/240 safety-related fields have no public information.", 205 "evidence": "Figure 6 and Section 4.6 document 133/240 safety 
fields across 30 agents as 'None found'; browser (64%) and enterprise (63%) agents have the highest missing rates.", 206 "supported": "strong" 207 }, 208 { 209 "claim": "Only 3/30 agents have documented third-party testing; 25/30 disclose no internal safety results.", 210 "evidence": "Section 4.6: 'Third-party testing is documented for only 3/30 agents (Anthropic Claude, OpenAI ChatGPT, OpenAI Codex). 25/30 agents disclose no internal safety results.'", 211 "supported": "strong" 212 }, 213 { 214 "claim": "Chinese-developed agents have substantially lower safety documentation than US agents: 1/5 with safety frameworks vs. ~75% of US agents.", 215 "evidence": "Figure 4b shows Chinese companies at 20% for AI Safety Framework and 20% for Compliance Standards, compared to 76% and 95% for US companies.", 216 "supported": "strong" 217 }, 218 { 219 "claim": "Model Context Protocol has become the dominant interoperability standard, supported by 20/30 agents including all 13 enterprise platforms.", 220 "evidence": "Section 4.5 and Figure 12: '20/30 agents explicitly support MCP'; all 13 enterprise agents support MCP versus 4/12 chat agents.", 221 "supported": "strong" 222 }, 223 { 224 "claim": "Most agents (21/30) do not disclose their AI nature to end users or third parties by default, and only 3/30 support media watermarking.", 225 "evidence": "Section 4.5: '21/30 agents have no documented default disclosure behavior. 
Only 3/30 agents support watermarking generated media (e.g., through SynthID and C2PA).'", 226 "supported": "strong" 227 }, 228 { 229 "claim": "Browser-based agents frequently bypass robots.txt and anti-bot measures, with some explicitly marketing this capability.", 230 "evidence": "Section 4.5 and 5.2: BrowserUse 'explicitly markets bypassing anti-bot systems'; Cloudflare documented Perplexity using undeclared Chrome-signature crawlers; only 6/30 agents explicitly state robots.txt compliance.", 231 "supported": "strong" 232 }, 233 { 234 "claim": "The ecosystem exhibits concentrated model dependency: almost all non-frontier-lab agents rely on GPT, Claude, or Gemini model families.", 235 "evidence": "Section 4.3 and 6.1: 'Only frontier labs themselves (Anthropic, Google, OpenAI) and Chinese developers run their own proprietary models; the majority rely primarily on GPT, Claude, or Gemini model families.'", 236 "supported": "strong" 237 } 238 ], 239 "methodology_tags": [ 240 "observational", 241 "case-study", 242 "qualitative" 243 ], 244 "key_findings": "The 2025 AI Agent Index documents 45 fields across 30 deployed AI agents and finds widespread transparency gaps: 133/240 safety-related fields have no public information, only 3/30 agents have third-party testing documentation, and only 4/30 have agent-specific system cards. Chinese developers show markedly lower safety documentation than US counterparts (1/5 vs. ~75% with safety frameworks). The ecosystem is structurally concentrated around GPT/Claude/Gemini models, creating shared dependency risks. 
Browser agents operating at L4–L5 autonomy present the highest risk profile with the least safety documentation, and browser-based agents frequently bypass web conduct standards like robots.txt, in some cases with active justifications from developers; only 6/30 agents explicitly state robots.txt compliance.", 245 "red_flags": [ 246 { 247 "flag": "Non-reproducible search", 248 "detail": "Initial candidate discovery used LLM-based queries (ChatGPT 5.2, Claude Sonnet 4.5, Gemini 2.5) whose outputs are non-deterministic; a re-run with the same prompts would yield a different candidate list, undermining reproducibility of the 95-agent starting pool." 249 }, 250 { 251 "flag": "Low developer response rate", 252 "detail": "Only 23% of developers offered any response to annotation review requests, and only 4/30 provided substantive corrections; findings likely substantially undercount internal safety practices that exist but are not published." 253 }, 254 { 255 "flag": "Conflates non-disclosure with absence", 256 "detail": "While the paper distinguishes 'None found' from 'None,' the headline finding that '133/240 safety fields have no information' conflates genuine absence of safety measures with non-disclosure of existing ones, which the paper's own methodology acknowledges is ambiguous." 257 }, 258 { 259 "flag": "Small and biased sample", 260 "detail": "30 agents selected by significance criteria that favor consumer-facing products and US/China companies; excludes domain-specific agents, internal deployments, and less prominent developers, making ecosystem-wide conclusions speculative." 
261 } 262 ], 263 "cited_papers": [ 264 { 265 "title": "The AI Agent Index (Casper et al., 2025)", 266 "relevance": "Predecessor 2024 index that this work directly updates and expands, including revised inclusion criteria and annotation fields" 267 }, 268 { 269 "title": "Visibility into AI Agents (Chan et al., 2024)", 270 "relevance": "Prior work on transparency and documentation of AI agent systems, foundational to the Index's motivation and framing" 271 }, 272 { 273 "title": "The 2024 Foundation Model Transparency Index (Bommasani et al., 2024)", 274 "relevance": "Comparable documentation effort for foundation models; used as an inclusion criterion for developer significance" 275 }, 276 { 277 "title": "Levels of Autonomy for AI Agents (Feng et al., 2025)", 278 "relevance": "Provides the L1–L5 autonomy framework used throughout the Index to characterize and compare agent autonomy levels" 279 }, 280 { 281 "title": "Harms from Increasingly Agentic Algorithmic Systems (Chan et al., 2023)", 282 "relevance": "Defines the agency criteria (autonomy, goal complexity, environmental interaction, generality) directly adopted for inclusion criteria" 283 }, 284 { 285 "title": "Holistic Agent Leaderboard (Kapoor et al., 2025)", 286 "relevance": "Concurrent work documenting agentic AI systems across capability benchmarks; used for cross-referencing agent candidates" 287 }, 288 { 289 "title": "Safetywashing: Do AI Safety Benchmarks Actually Measure Safety Progress? 
(Ren et al., 2024)", 290 "relevance": "Provides theoretical grounding for the 'safety-washing' concept applied to the transparency asymmetry observed between capability benchmarks and safety documentation" 291 }, 292 { 293 "title": "Infrastructure for AI Agents (Chan et al., 2025)", 294 "relevance": "Discusses governance challenges for web-interacting agents, directly relevant to the robots.txt and web conduct findings" 295 } 296 ], 297 "engagement_factors": { 298 "practical_relevance": { 299 "score": 3, 300 "justification": "Directly usable by policymakers, procurement teams, and researchers to compare agents on safety and transparency; the online Index at aiagentindex.mit.edu is a live, downloadable resource." 301 }, 302 "surprise_contrarian": { 303 "score": 2, 304 "justification": "The finding that 133/240 safety fields have no public information and only 3/30 agents have third-party testing challenges industry safety-responsibility narratives, though the general direction is expected." 305 }, 306 "fear_safety": { 307 "score": 3, 308 "justification": "Concretely documents L4–L5 autonomy browser agents with prompt injection vulnerabilities, agents designed to bypass anti-bot systems, and absence of safety oversight for most deployed systems." 309 }, 310 "drama_conflict": { 311 "score": 2, 312 "justification": "Documents real legal disputes (Amazon vs. Perplexity, NYT vs. OpenAI, Reddit vs. Anthropic) and specific named prompt injection incidents against Perplexity Comet and Opera Neon." 313 }, 314 "demo_ability": { 315 "score": 3, 316 "justification": "The full Index is publicly available at aiagentindex.mit.edu in JSON and CSV formats, and all 30 documented agents are themselves publicly accessible products users can try." 
317 }, 318 "brand_recognition": { 319 "score": 3, 320 "justification": "Covers Anthropic Claude, OpenAI ChatGPT, Google Gemini, Microsoft Copilot Studio, Salesforce Agentforce, and 25 other flagship products from the most recognized AI companies." 321 } 322 }, 323 "hn_data": { 324 "threads": [ 325 { 326 "hn_id": "47279778", 327 "title": "Nested Training for Mutual Adaptation in Human-AI Teaming", 328 "points": 2, 329 "comments": 0, 330 "url": "https://news.ycombinator.com/item?id=47279778", 331 "created_at": "2026-03-06T19:21:19Z" 332 } 333 ], 334 "top_points": 2, 335 "total_points": 2, 336 "total_comments": 0 337 } 338 }