scan.json (24194B)
1 { 2 "paper": { 3 "title": "The 2025 AI Agent Index: Documenting Technical and Safety Features of Deployed Agentic AI Systems", 4 "authors": [ 5 "Leon Staufer", 6 "Kevin Feng", 7 "Kevin Wei", 8 "Luke Bailey", 9 "Yawen Duan", 10 "Mick Yang", 11 "A. Pinar Ozisik", 12 "Stephen Casper", 13 "Noam Kolt" 14 ], 15 "year": 2026, 16 "venue": "arXiv", 17 "arxiv_id": "2602.17753" 18 }, 19 "scan_version": 3, 20 "active_modules": [ 21 "survey_methodology" 22 ], 23 "methodology_tags": [ 24 "meta-analysis", 25 "qualitative" 26 ], 27 "key_findings": "The 2025 AI Agent Index documents 30 deployed agentic AI systems across 1,350 annotation fields, finding significant transparency gaps especially around safety. Only 4/30 agents have agent-specific system cards, 25/30 disclose no internal safety results, and 23/30 have no third-party testing. The ecosystem concentrates on a few foundation models (GPT, Claude, Gemini), creating single points of failure. Browser agents operate at the highest autonomy levels (L4-L5) while often ignoring robots.txt and lacking safety evaluations.", 28 "claims": [ 29 { 30 "claim": "Most agent developers share little information about safety, evaluations, and societal impacts — 133/240 safety-related fields have no information available.", 31 "evidence": "Section 4.6 and Figure 6 document missing safety fields across all 30 agents, broken down by agent category (browser 64%, enterprise 63%, chat 43%).", 32 "supported": "strong" 33 }, 34 { 35 "claim": "Only 4/30 agents provide agent-specific system cards (ChatGPT Agent, OpenAI Codex, Claude Code, Gemini 2.5 Computer Use).", 36 "evidence": "Section 4.6 explicitly lists these four agents as having dedicated agent-specific system cards.", 37 "supported": "strong" 38 }, 39 { 40 "claim": "A transparency asymmetry exists suggesting a weaker form of 'safety washing' where high-level frameworks mask selective empirical disclosure.", 41 "evidence": "Section 6.1 discusses this pattern but the characterization as 
'safety washing' is interpretive rather than empirically demonstrated.", 42 "supported": "moderate" 43 }, 44 { 45 "claim": "Browser-based agents operate at L4-L5 autonomy with limited mid-execution intervention.", 46 "evidence": "Section 4.4 documents autonomy levels for all 30 agents with specific examples (Perplexity Comet, Browser Use).", 47 "supported": "strong" 48 }, 49 { 50 "claim": "20/30 agents support MCP for tool integration.", 51 "evidence": "Section 4.5 states this finding directly based on the annotation data.", 52 "supported": "strong" 53 } 54 ], 55 "red_flags": [ 56 { 57 "flag": "No quality assessment of source documentation", 58 "detail": "The Index documents what information is publicly available but does not assess the quality or accuracy of the documentation it catalogues. Safety frameworks are noted as present/absent without evaluating their substance." 59 }, 60 { 61 "flag": "Inclusion criteria favor well-funded entities", 62 "detail": "The market significance criterion (≥$1B valuation) and public interest thresholds (10K searches, 20K GitHub stars) systematically exclude smaller or emerging agents. The authors acknowledge this in limitations." 63 }, 64 { 65 "flag": "LLM-assisted candidate discovery", 66 "detail": "Initial candidate agents were surfaced using ChatGPT 5.2, Claude Sonnet 4.5, and Gemini 2.5 with research mode. While human experts made final decisions, this could bias toward agents these models know about." 67 } 68 ], 69 "checklist": { 70 "artifacts": { 71 "code_released": { 72 "applies": true, 73 "answer": true, 74 "justification": "The full annotations are released in JSON and CSV format on Zenodo (doi:10.5281/zenodo.18701931) and the Index is available at https://aiagentindex.mit.edu. The Generative AI Usage Statement mentions code for visualizations but does not mention releasing analysis code separately." 
75 }, 76 "data_released": { 77 "applies": true, 78 "answer": true, 79 "justification": "Section A states 'The full annotations for all fields are available in JSON and CSV format on Zenodo at: https://doi.org/10.5281/zenodo.18701931'." 80 }, 81 "environment_specified": { 82 "applies": true, 83 "answer": false, 84 "justification": "No environment specifications, requirements.txt, or dependency information is provided. The paper mentions using Python for visualizations but provides no setup details." 85 }, 86 "reproduction_instructions": { 87 "applies": true, 88 "answer": false, 89 "justification": "No step-by-step reproduction instructions are provided. The annotation methodology is described in Section 3.4 and appendices, but there are no scripts or detailed procedures to reproduce the annotation process." 90 } 91 }, 92 "statistical_methodology": { 93 "confidence_intervals_or_error_bars": { 94 "applies": false, 95 "answer": false, 96 "justification": "This is a documentation/survey paper reporting descriptive counts (e.g., 20/30 agents support MCP). No inferential statistics are used." 97 }, 98 "significance_tests": { 99 "applies": false, 100 "answer": false, 101 "justification": "The paper makes no comparative statistical claims requiring significance tests. All findings are descriptive counts and proportions." 102 }, 103 "effect_sizes_reported": { 104 "applies": false, 105 "answer": false, 106 "justification": "No effect sizes are applicable — the paper reports descriptive proportions from a documentation exercise, not experimental comparisons." 107 }, 108 "sample_size_justified": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section 3.1 provides detailed inclusion criteria (agency, impact, practicality) that explain why 30 agents were selected from 95 candidates. The criteria are well-justified even if the resulting sample is not statistically representative." 
112 }, 113 "variance_reported": { 114 "applies": false, 115 "answer": false, 116 "justification": "No experimental runs or repeated measurements are conducted. The paper documents factual properties of agents." 117 } 118 }, 119 "evaluation_design": { 120 "baselines_included": { 121 "applies": true, 122 "answer": true, 123 "justification": "The paper explicitly compares against the inaugural 2024 AI Agent Index (Casper et al. [22]), noting revised inclusion criteria, expanded fields, and different response rates from developers." 124 }, 125 "baselines_contemporary": { 126 "applies": true, 127 "answer": true, 128 "justification": "The 2024 AI Agent Index [22] is the most recent comparable work. The paper also references the Princeton Holistic Agent Leaderboard [67] and AIAgentList.com [4] as concurrent efforts." 129 }, 130 "ablation_study": { 131 "applies": false, 132 "answer": false, 133 "justification": "This is a documentation survey, not a system with components to ablate." 134 }, 135 "multiple_metrics": { 136 "applies": false, 137 "answer": false, 138 "justification": "The paper is a documentation exercise, not an evaluation requiring metrics." 139 }, 140 "human_evaluation": { 141 "applies": false, 142 "answer": false, 143 "justification": "Human evaluation of system outputs is not applicable to this documentation survey." 144 }, 145 "held_out_test_set": { 146 "applies": false, 147 "answer": false, 148 "justification": "No test set is used — this is a survey/documentation paper." 149 }, 150 "per_category_breakdown": { 151 "applies": true, 152 "answer": true, 153 "justification": "Results are consistently broken down by agent category (chat, browser, enterprise) throughout Section 4 and in Figures 3, 5, 6, and 12." 
154 }, 155 "failure_cases_discussed": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section 6.2 discusses limitations of the Index methodology including coverage biases, reliance on public information, language limitations, and potential for missing qualifying systems." 159 }, 160 "negative_results_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "The paper reports negative findings extensively: low developer response rates (23%), missing safety documentation (133/240 fields), lack of third-party testing (23/30), and the 'safety washing' pattern." 164 } 165 }, 166 "claims_and_evidence": { 167 "abstract_claims_supported": { 168 "applies": true, 169 "answer": true, 170 "justification": "The abstract claims about documenting 30 agents, finding transparency gaps, and most developers sharing little safety information are all supported by detailed findings in Sections 4 and 6." 171 }, 172 "causal_claims_justified": { 173 "applies": false, 174 "answer": false, 175 "justification": "The paper makes descriptive claims about the current state of agent documentation. No causal claims are made." 176 }, 177 "generalization_bounded": { 178 "applies": true, 179 "answer": true, 180 "justification": "Section 6.2 explicitly bounds the scope: 'Our inclusion criteria favor the most significant agents, which may affect generalizability. Public interest metrics favor consumer products over enterprise deployments. Domain-specific agents are excluded.'" 181 }, 182 "alternative_explanations_discussed": { 183 "applies": true, 184 "answer": true, 185 "justification": "The paper considers alternative explanations: Chinese agents' missing safety documentation 'may simply not be documented publicly' (Section 4.2). The discussion also notes that missing safety information could reflect internal practices not shared publicly rather than absence of safety work." 
186 }, 187 "proxy_outcome_distinction": { 188 "applies": false, 189 "answer": false, 190 "justification": "This is a documentation survey of 30 AI agents. It makes no empirical measurements and claims no outcome — it catalogs publicly available information. No proxy-outcome gap exists because no measurement is used to represent a broader construct." 191 } 192 }, 193 "setup_transparency": { 194 "model_versions_specified": { 195 "applies": false, 196 "answer": false, 197 "justification": "No models are evaluated in experiments. The Generative AI Usage Statement mentions models used for auxiliary tasks but these are not experimental." 198 }, 199 "prompts_provided": { 200 "applies": true, 201 "answer": true, 202 "justification": "The paper states 'detailed prompts in Section B.5' for the LLM-based candidate agent discovery process." 203 }, 204 "hyperparameters_reported": { 205 "applies": false, 206 "answer": false, 207 "justification": "No model experiments are conducted requiring hyperparameter reporting." 208 }, 209 "scaffolding_described": { 210 "applies": false, 211 "answer": false, 212 "justification": "No agentic scaffolding is used in the methodology." 213 }, 214 "data_preprocessing_documented": { 215 "applies": true, 216 "answer": true, 217 "justification": "Section 3 describes the full pipeline: LLM-based candidate surfacing (95 candidates) → screening against inclusion criteria → in-depth annotation → cross-validation → GPT-5.2 verification → developer review (4 weeks). Sections 3.1-3.4 and B.4-B.6 provide detailed criteria at each stage." 218 } 219 }, 220 "limitations_and_scope": { 221 "limitations_section_present": { 222 "applies": true, 223 "answer": true, 224 "justification": "Section 6.2 'Limitations and Outlook' provides a dedicated multi-paragraph discussion of limitations." 
225 }, 226 "threats_to_validity_specific": { 227 "applies": true, 228 "answer": true, 229 "justification": "Section 6.2 discusses specific threats: 'Public interest metrics favor consumer products over enterprise deployments,' 'The Index relies exclusively on publicly available information, which may miss internal evaluations,' 'The Index relies on English and Chinese documentation and may miss information available in other languages.'" 230 }, 231 "scope_boundaries_stated": { 232 "applies": true, 233 "answer": true, 234 "justification": "Section 6.2 explicitly states scope boundaries: domain-specific agents excluded, only publicly available and deployable agents included, presents a snapshot as of December 31, 2025, internal company agents remain opaque." 235 } 236 }, 237 "data_integrity": { 238 "raw_data_available": { 239 "applies": true, 240 "answer": true, 241 "justification": "Full annotation data is released on Zenodo in JSON and CSV format (doi:10.5281/zenodo.18701931) and the Index is publicly available at https://aiagentindex.mit.edu." 242 }, 243 "data_collection_described": { 244 "applies": true, 245 "answer": true, 246 "justification": "Section 3.4 describes the data collection procedure in detail: annotation from public documentation, websites, demos, published papers, and governance documents. Seven subject-matter experts annotated according to category. All web sources were archived." 247 }, 248 "recruitment_methods_described": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants were recruited. The data source is publicly available documentation about AI agents." 
252 }, 253 "data_pipeline_documented": { 254 "applies": true, 255 "answer": true, 256 "justification": "The pipeline is documented: 95 candidates surfaced via LLM queries → screened against inclusion criteria → annotated by 7 experts across categories → cross-validated with independent review → 37/1,350 discrepancies resolved → GPT-5.2 verification screening → 4-week developer review period." 257 } 258 }, 259 "conflicts_of_interest": { 260 "funding_disclosed": { 261 "applies": true, 262 "answer": true, 263 "justification": "The Acknowledgments section states: 'This research was supported by the MATS Research program, which provided funding for L.S. and M.Y. through research stipends.'" 264 }, 265 "affiliations_disclosed": { 266 "applies": true, 267 "answer": true, 268 "justification": "Author affiliations are listed: University of Cambridge, University of Washington, Harvard Law School, Stanford, Concordia AI, UPenn, MIT, Hebrew University of Jerusalem. No authors are affiliated with the companies whose agents are indexed." 269 }, 270 "funder_independent_of_outcome": { 271 "applies": true, 272 "answer": true, 273 "justification": "MATS (ML Alignment & Theory Scholars) is an AI safety research program with no direct financial interest in the evaluation outcome of any specific AI agent." 274 }, 275 "financial_interests_declared": { 276 "applies": true, 277 "answer": false, 278 "justification": "No competing interests or financial interests statement is present in the paper." 279 } 280 }, 281 "contamination": { 282 "training_cutoff_stated": { 283 "applies": false, 284 "answer": false, 285 "justification": "This paper does not evaluate a pre-trained model's capability on any benchmark. It is a documentation survey of AI agents." 286 }, 287 "train_test_overlap_discussed": { 288 "applies": false, 289 "answer": false, 290 "justification": "Not applicable — no benchmark evaluation of model capabilities is performed." 
291 }, 292 "benchmark_contamination_addressed": { 293 "applies": false, 294 "answer": false, 295 "justification": "Not applicable — no benchmark evaluation is performed." 296 } 297 }, 298 "human_studies": { 299 "pre_registered": { 300 "applies": false, 301 "answer": false, 302 "justification": "No human participants. The paper documents publicly available information about AI agents." 303 }, 304 "irb_or_ethics_approval": { 305 "applies": false, 306 "answer": false, 307 "justification": "No human participants." 308 }, 309 "demographics_reported": { 310 "applies": false, 311 "answer": false, 312 "justification": "No human participants." 313 }, 314 "inclusion_exclusion_criteria": { 315 "applies": false, 316 "answer": false, 317 "justification": "No human participants." 318 }, 319 "randomization_described": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants." 323 }, 324 "blinding_described": { 325 "applies": false, 326 "answer": false, 327 "justification": "No human participants." 328 }, 329 "attrition_reported": { 330 "applies": false, 331 "answer": false, 332 "justification": "No human participants." 333 } 334 }, 335 "cost_and_practicality": { 336 "inference_cost_reported": { 337 "applies": false, 338 "answer": false, 339 "justification": "This is a survey/documentation paper. Cost of the paper's own method is not applicable." 340 }, 341 "compute_budget_stated": { 342 "applies": false, 343 "answer": false, 344 "justification": "This is a survey/documentation paper. No significant compute was required beyond standard research tools." 
345 } 346 }, 347 "survey_methodology": { 348 "prisma_or_structured_protocol": { 349 "applies": true, 350 "answer": true, 351 "justification": "Section 3 describes a structured protocol: explicit inclusion criteria (agency, impact, practicality — Figure 2), systematic candidate identification via LLM queries and cross-referencing against multiple databases (Section 3.3), detailed annotation protocols developed through calibration exercises (Section B.4), and inter-annotator consistency measures." 352 }, 353 "quality_assessment_of_sources": { 354 "applies": true, 355 "answer": false, 356 "justification": "The Index documents what information exists but does not assess the quality of the documentation it catalogues. Safety frameworks are recorded as present/absent without evaluating their rigor or substance." 357 }, 358 "publication_bias_discussed": { 359 "applies": true, 360 "answer": true, 361 "justification": "The paper explicitly discusses information availability bias: 'The Index relies exclusively on publicly available information, which may miss internal evaluations or risk management practices' (Section 6.2). It also distinguishes 'None found' from 'None' to address the bias of equating absence of public information with absence of practice." 362 } 363 } 364 }, 365 "cited_papers": [ 366 { 367 "title": "The AI Agent Index", 368 "authors": [ 369 "Stephen Casper", 370 "Luke Bailey", 371 "Rosco Hunter", 372 "Carson Ezell" 373 ], 374 "year": 2025, 375 "arxiv_id": "2502.01635", 376 "relevance": "Inaugural version of the AI Agent Index — directly comparable prior work on documenting deployed agentic AI systems." 377 }, 378 { 379 "title": "Harms from Increasingly Agentic Algorithmic Systems", 380 "authors": [ 381 "Alan Chan", 382 "Rebecca Salganik", 383 "Alva Markelius" 384 ], 385 "year": 2023, 386 "relevance": "Defines agency properties used in the Index's inclusion criteria and characterizes risks from agentic systems." 
387 }, 388 { 389 "title": "Infrastructure for AI Agents", 390 "authors": [ 391 "Alan Chan", 392 "Kevin Wei", 393 "Sihao Huang" 394 ], 395 "year": 2025, 396 "relevance": "Discusses infrastructure and visibility challenges for AI agents, directly relevant to transparency and governance findings." 397 }, 398 { 399 "title": "AgentHarm: A Benchmark for Measuring Harmfulness of LLM Agents", 400 "authors": [ 401 "Maksym Andriushchenko", 402 "Alexandra Souly" 403 ], 404 "year": 2025, 405 "relevance": "Benchmark for evaluating harmful capabilities of LLM agents, relevant to agent safety evaluation." 406 }, 407 { 408 "title": "Safetywashing: Do AI Safety Benchmarks Actually Measure Safety Progress?", 409 "authors": [ 410 "Richard Ren", 411 "Steven Basart", 412 "Adam Khoja" 413 ], 414 "year": 2024, 415 "relevance": "Directly relevant to the paper's finding about 'safety washing' patterns in agent documentation." 416 }, 417 { 418 "title": "Levels of Autonomy for AI Agents", 419 "authors": [ 420 "K. J. Kevin Feng", 421 "David W. McDonald", 422 "Amy X. Zhang" 423 ], 424 "year": 2025, 425 "relevance": "Provides the L1-L5 autonomy framework used throughout the Index to characterize agent autonomy levels." 426 }, 427 { 428 "title": "Security Challenges in AI Agent Deployment: Insights from a Large Scale Public Competition", 429 "authors": [ 430 "Andy Zou", 431 "Maxwell Lin", 432 "Eliot Jones" 433 ], 434 "year": 2025, 435 "relevance": "Large-scale evaluation of security challenges in AI agent deployment, relevant to agent safety benchmarking." 436 }, 437 { 438 "title": "OpenAgentSafety: A Comprehensive Framework for Evaluating Real-World AI Agent Safety", 439 "authors": [ 440 "Sanidhya Vijayvargiya" 441 ], 442 "year": 2025, 443 "relevance": "Framework for evaluating real-world AI agent safety, directly relevant to the Index's safety evaluation findings." 
444 }, 445 { 446 "title": "The 2025 Foundation Model Transparency Index", 447 "authors": [ 448 "Alexander Wan", 449 "Kevin Klyman", 450 "Sayash Kapoor" 451 ], 452 "year": 2025, 453 "relevance": "Comparable transparency documentation effort for foundation models that the Index references for developer significance criteria." 454 }, 455 { 456 "title": "BrowseSafe: Understanding and Preventing Prompt Injection Within AI Browser Agents", 457 "authors": [ 458 "Kaiyuan Zhang", 459 "Mark Tenenholtz" 460 ], 461 "year": 2025, 462 "relevance": "Addresses prompt injection in browser agents, directly relevant to the security incidents documented in the Index." 463 }, 464 { 465 "title": "Holistic Agent Leaderboard: The Missing Infrastructure for AI Agent Evaluation", 466 "authors": [ 467 "Sayash Kapoor", 468 "Benedikt Stroebl" 469 ], 470 "year": 2025, 471 "relevance": "Concurrent work curating evaluations of agentic AI systems across benchmarks, complementary to the Index approach." 472 }, 473 { 474 "title": "Characterizing AI Agents for Alignment and Governance", 475 "authors": [ 476 "Atoosa Kasirzadeh", 477 "Iason Gabriel" 478 ], 479 "year": 2025, 480 "relevance": "Provides characterization framework for AI agents used in the Index's agency inclusion criteria." 481 } 482 ], 483 "engagement_factors": { 484 "practical_relevance": { 485 "score": 1, 486 "justification": "Useful reference index for comparing AI agents but not a tool or technique practitioners can directly apply in their workflow." 487 }, 488 "surprise_contrarian": { 489 "score": 2, 490 "justification": "The stark numbers — 25/30 agents disclose no safety results, only 4/30 have agent-specific system cards — and the 'safety washing' claim challenge the industry's safety narrative." 491 }, 492 "fear_safety": { 493 "score": 2, 494 "justification": "Safety transparency gaps are a central theme with concrete data showing browser agents at L4-L5 autonomy lacking evaluations, plus documented prompt injection incidents." 
495 }, 496 "drama_conflict": { 497 "score": 2, 498 "justification": "Directly names companies with missing safety documentation, alleges a weaker form of 'safety washing', and highlights Perplexity's robots.txt evasion and Amazon's legal threats." 499 }, 500 "demo_ability": { 501 "score": 2, 502 "justification": "The full index is browsable at aiagentindex.mit.edu with structured data downloadable from Zenodo in JSON and CSV formats." 503 }, 504 "brand_recognition": { 505 "score": 3, 506 "justification": "Covers ChatGPT, Claude, Gemini, and Copilot; authors from MIT, Cambridge, Harvard, and Stanford with an MIT-hosted website." 507 } 508 } 509 }