scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (19931B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "The 2025 AI Agent Index: Documenting Technical and Safety Features of Deployed Agentic AI Systems",
      6     "authors": [
      7       "Leon Staufer",
      8       "Kevin Feng",
      9       "Kevin Wei",
     10       "Luke Bailey",
     11       "Yawen Duan",
     12       "Mick Yang",
     13       "A. Pinar Ozisik",
     14       "Stephen Casper",
     15       "Noam Kolt"
     16     ],
     17     "year": 2026,
     18     "venue": "arXiv",
     19     "arxiv_id": "2602.17753",
     20     "doi": null
     21   },
     22   "checklist": {
     23     "claims_and_evidence": {
     24       "abstract_claims_supported": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The abstract claims about documenting 30 agents, finding transparency gaps, and most developers sharing little safety information are all supported by detailed findings in Sections 4 and 6.",
     28         "source": "opus"
     29       },
     30       "causal_claims_justified": {
     31         "applies": false,
     32         "answer": false,
     33         "justification": "The paper makes descriptive claims about the current state of agent documentation. No causal claims are made.",
     34         "source": "opus"
     35       },
     36       "generalization_bounded": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Section 6.2 explicitly bounds the scope: 'Our inclusion criteria favor the most significant agents, which may affect generalizability. Public interest metrics favor consumer products over enterprise deployments. Domain-specific agents are excluded.'",
     40         "source": "opus"
     41       },
     42       "alternative_explanations_discussed": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper considers alternative explanations: Chinese agents' missing safety documentation 'may simply not be documented publicly' (Section 4.2). The discussion also notes that missing safety information could reflect internal practices not shared publicly rather than absence of safety work.",
     46         "source": "opus"
     47       },
     48       "proxy_outcome_distinction": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "This is a documentation survey of 30 AI agents. It makes no empirical measurements and claims no outcome — it catalogs publicly available information. No proxy-outcome gap exists because no measurement is used to represent a broader construct.",
     52         "source": "opus"
     53       }
     54     },
     55     "limitations_and_scope": {
     56       "limitations_section_present": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section 6.2 'Limitations and Outlook' provides a dedicated multi-paragraph discussion of limitations.",
     60         "source": "opus"
     61       },
     62       "threats_to_validity_specific": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Section 6.2 discusses specific threats: 'Public interest metrics favor consumer products over enterprise deployments,' 'The Index relies exclusively on publicly available information, which may miss internal evaluations,' 'The Index relies on English and Chinese documentation and may miss information available in other languages.'",
     66         "source": "opus"
     67       },
     68       "scope_boundaries_stated": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Section 6.2 explicitly states scope boundaries: domain-specific agents excluded, only publicly available and deployable agents included, presents a snapshot as of December 31, 2025, internal company agents remain opaque.",
     72         "source": "opus"
     73       }
     74     },
     75     "conflicts_of_interest": {
     76       "funding_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The Acknowledgments section states: 'This research was supported by the MATS Research program, which provided funding for L.S. and M.Y. through research stipends.'",
     80         "source": "opus"
     81       },
     82       "affiliations_disclosed": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Author affiliations are listed: University of Cambridge, University of Washington, Harvard Law School, Stanford, Concordia AI, UPenn, MIT, Hebrew University of Jerusalem. No authors are affiliated with the companies whose agents are indexed.",
     86         "source": "opus"
     87       },
     88       "funder_independent_of_outcome": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "MATS (ML Alignment Theory Scholars) is an AI safety research program with no direct financial interest in the evaluation outcome of any specific AI agent.",
     92         "source": "opus"
     93       },
     94       "financial_interests_declared": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No competing interests or financial interests statement is present in the paper.",
     98         "source": "opus"
     99       }
    100     },
    101     "scope_and_framing": {
    102       "key_terms_defined": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 3.1 operationalizes 'agent' through four measurable criteria (autonomy, goal complexity, environmental interaction, generality) with specific thresholds; autonomy levels L1-L5 from Feng et al. are explicitly defined and applied throughout.",
    106         "source": "haiku"
    107       },
    108       "intended_contribution_clear": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Three contributions are explicitly enumerated in Section 1: (1) Agent Index of 30 systems, (2) ecosystem-wide trends, (3) three illustrative case studies.",
    112         "source": "haiku"
    113       },
    114       "engagement_with_prior_work": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Section 2 situates the work against the 2024 AI Agent Index (Casper et al.), Foundation Model Transparency Index, Princeton Holistic Agent Leaderboard, and prior documentation frameworks (model cards, system cards, datasheets), explicitly noting how this work expands upon and differs from each.",
    118         "source": "haiku"
    119       }
    120     }
    121   },
    122   "type_checklist": {
    123     "survey": {
    124       "search_and_selection": {
    125         "search_strategy_reproducible": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "Candidate discovery relied on LLM-based queries (ChatGPT, Claude, Gemini with research mode) which are non-deterministic; the same prompts would not reliably produce the same 95-candidate list, making true reproduction impossible even though prompts are provided in Appendix B.5.",
    129           "source": "haiku"
    130         },
    131         "inclusion_exclusion_explicit": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "Section 3.1 and Figure 2 specify three criteria categories with explicit thresholds: all four agency criteria required, at least one of three impact criteria required (with quantitative cutoffs: 10,000 searches or $1B valuation), and all three practicality criteria required.",
    135           "source": "haiku"
    136         },
    137         "prisma_or_structured_protocol": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No PRISMA or equivalent structured review protocol is mentioned or followed; the paper uses a custom inclusion framework appropriate to an agent audit rather than a literature review.",
    141           "source": "haiku"
    142         },
    143         "search_terms_provided": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "Appendix B.5 provides the full LLM prompts used to discover candidate agents, Appendix C.1 provides the search term generation prompts and criteria, and Figure 15 shows the full list of search terms evaluated by Ahrefs.",
    147           "source": "haiku"
    148         },
    149         "databases_listed": {
    150           "applies": true,
    151           "answer": true,
    152           "justification": "Sources listed include the 2024 AI Agent Index, Princeton Holistic Agent Leaderboard, AIAgentList.com, Ahrefs API for search volume, Crunchbase and Epoch AI for valuations, and stock exchanges for market capitalization.",
    153           "source": "haiku"
    154         },
    155         "screening_process_documented": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "Section 3.3 documents the 95-candidate initial list, screening against inclusion criteria, ambiguous case handling, consultation with Chinese ecosystem experts, and cross-referencing; Appendix B.1 enumerates all 95 candidates with bold indicating inclusion.",
    159           "source": "haiku"
    160         },
    161         "review_scope_justified": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "The paper explains the rationale for focusing on 'highly agentic systems with high-impact real-world applications,' contrasts with the broader 2024 Index, and justifies the December 31, 2025 cutoff and the three agent categories based on interaction paradigm differences.",
    165           "source": "haiku"
    166         }
    167       },
    168       "synthesis_quality": {
    169         "conflicting_findings_acknowledged": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "The paper documents variation and tensions across agents: safety framework gaps between US and Chinese companies, the asymmetry between capability benchmarks and safety documentation, and the unresolved web conduct norms (robots.txt compliance vs. functionality).",
    173           "source": "haiku"
    174         },
    175         "quality_assessment_of_sources": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "While the paper assesses agent transparency quality, there is no systematic quality rubric or risk-of-bias assessment for the documentation sources themselves (official blog posts, help pages, demos); 'None found' vs. 'None' is a presence/absence distinction, not a quality weighting.",
    179           "source": "haiku"
    180         },
    181         "publication_bias_discussed": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Section 6.2 explicitly states 'The Index relies exclusively on publicly available information, which may miss internal evaluations or risk management practices,' and the ethical considerations note that 'None found' is distinguished from confirmed absence to avoid over-attributing transparency failures.",
    185           "source": "haiku"
    186         },
    187         "quantitative_synthesis_present": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "The paper provides systematic frequency counts across all 30 agents and 45 fields (e.g., '133/240 safety fields have no information,' '20/30 support MCP,' '3/30 have third-party testing'), with Figures 3, 6, 11, and 12 visualizing distribution patterns.",
    191           "source": "haiku"
    192         },
    193         "recommendations_supported_by_evidence": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Section 6.1 ties all observations directly to the annotation data (e.g., the 'safety washing' observation is grounded in the 133/240 missing safety fields and the 25/30 agents disclosing no internal safety results), and future directions track the documented gaps.",
    197           "source": "haiku"
    198         }
    199       }
    200     }
    201   },
    202   "claims": [
    203     {
    204       "claim": "133 out of 240 safety-related annotation fields have no public information available.",
    205       "evidence": "Figure 6 and Section 4.6 document this systematically across all 30 agents; browser agents (67/104, 64%) and enterprise agents (25/40, 63%) have the most missing safety fields.",
    206       "supported": "strong"
    207     },
    208     {
    209       "claim": "Only 3/30 agents have documented third-party testing, and 25/30 disclose no internal safety results.",
    210       "evidence": "Section 4.6 states 'Third-party testing is documented for only 3/30 agents (Anthropic Claude, OpenAI ChatGPT, OpenAI Codex). 25/30 agents disclose no internal safety results.'",
    211       "supported": "strong"
    212     },
    213     {
    214       "claim": "Model Context Protocol (MCP) is the dominant interoperability standard, supported by 20/30 agents.",
    215       "evidence": "Section 4.3 and Figure 12 document MCP support counts per category; enterprise agents show 13/13 MCP support.",
    216       "supported": "strong"
    217     },
    218     {
    219       "claim": "23/30 agents are fully closed source, with only 7 open-sourcing their agent framework or harness.",
    220       "evidence": "Section 4.3 lists the 7 open-source agents by name (Alibaba MobileAgent, Browser Use, ByteDance Agent TARS, Google Gemini CLI, n8n Agents, OpenAI Codex, WRITER).",
    221       "supported": "strong"
    222     },
    223     {
    224       "claim": "Chinese-incorporated agent developers systematically lack publicly documented safety frameworks (1/5) and compliance standards (1/5) relative to US developers.",
    225       "evidence": "Section 4.2 and Figure 4b show that, apart from Z.ai, no Chinese-incorporated developer publishes an AI safety framework (1/5), compared to 75% of US developers.",
    226       "supported": "strong"
    227     },
    228     {
    229       "claim": "Browser-based agents operate at the highest autonomy levels (L4-L5) with the most limited intervention mechanisms.",
    230       "evidence": "Section 4.4 and Figure 5 document browser agents at L4-L5; 'Browser Use's agent and Perplexity's Comet perform tasks autonomously once prompted, with no means for user involvement during execution.'",
    231       "supported": "strong"
    232     },
    233     {
    234       "claim": "Only 4 of 30 agents (ChatGPT Agent, OpenAI Codex, Claude Code, Gemini 2.5 Computer Use) provide agent-specific system cards.",
    235       "evidence": "Section 6.1 explicitly names these four, noting that others rely only on base model documentation.",
    236       "supported": "strong"
    237     }
    238   ],
    239   "methodology_tags": [
    240     "observational",
    241     "case-study",
    242     "qualitative"
    243   ],
    244   "key_findings": "The 2025 AI Agent Index documents 30 highly agentic deployed systems across 45 fields, finding pervasive transparency gaps: 133/240 safety-related fields had no public information, only 3/30 agents have third-party testing documentation, and only 4/30 provide agent-specific system cards. The ecosystem is heavily concentrated around three frontier model families (GPT, Claude, Gemini), creating systemic dependency risks. Browser-based agents operate at the highest autonomy levels (L4-L5) with the weakest safety documentation, while enterprise agent builders systematically delegate safety responsibility to end users. A transparency asymmetry exists where capability benchmarks are more frequently reported than safety evaluations, consistent with a weak form of 'safety washing.'",
    245   "red_flags": [
    246     {
    247       "flag": "Non-deterministic discovery",
    248       "detail": "Candidate agents were surfaced via LLM-based queries (ChatGPT, Claude, Gemini) which are inherently non-deterministic; the same prompts would not reliably reproduce the same 95-candidate pool, undermining reproducibility claims."
    249     },
    250     {
    251       "flag": "Low developer response rate",
    252       "detail": "Only 23% of contacted companies offered any response, and only 4/30 provided substantive corrections; many annotations may contain inaccuracies that were not verified by the companies being described."
    253     },
    254     {
    255       "flag": "Public-information-only constraint",
    256       "detail": "The entire index is based on publicly available documentation, which means transparency scores reflect communication quality rather than actual safety practices; companies with good marketing may outscore genuinely safer but less vocal companies."
    257     },
    258     {
    259       "flag": "No competing interests declaration",
    260       "detail": "No formal competing interests statement is provided for any of the authors; several authors are associated with institutions (MIT, Harvard, Stanford) that have active relationships with some of the evaluated companies."
    261     },
    262     {
    263       "flag": "LLM-assisted annotation verification",
    264       "detail": "GPT-5.2 was used to cross-check annotations for factual accuracy; this introduces potential systematic bias if the model has knowledge gaps or biases about specific agents, particularly less prominent ones."
    265     }
    266   ],
    267   "cited_papers": [
    268     {
    269       "title": "The AI Agent Index (2024)",
    270       "relevance": "Direct predecessor paper; the 2025 Index expands upon its methodology and inclusion criteria for comparative context."
    271     },
    272     {
    273       "title": "The 2024 Foundation Model Transparency Index",
    274       "relevance": "Comparable documentation effort for foundation models; used as a developer significance criterion and methodological comparison."
    275     },
    276     {
    277       "title": "Harms from Increasingly Agentic Algorithmic Systems",
    278       "relevance": "Characterizes AI agent properties (autonomy, goal complexity, environmental interaction) that form the theoretical basis for the Index's inclusion criteria."
    279     },
    280     {
    281       "title": "Levels of Autonomy for AI Agents",
    282       "relevance": "Provides the L1-L5 autonomy framework used throughout the Index to classify agent autonomy levels."
    283     },
    284     {
    285       "title": "Visibility into AI Agents",
    286       "relevance": "Related work on agent oversight mechanisms and transparency, directly relevant to the Index's ecosystem interaction category."
    287     },
    288     {
    289       "title": "Characterizing AI Agents for Alignment and Governance",
    290       "relevance": "Provides theoretical grounding for agent characterization used in the inclusion criteria alongside Chan et al."
    291     },
    292     {
    293       "title": "Holistic Agent Leaderboard: The Missing Infrastructure for AI Agent Evaluation",
    294       "relevance": "Concurrent work on agent evaluation infrastructure; cross-referenced during candidate identification."
    295     },
    296     {
    297       "title": "Safetywashing: Do AI Safety Benchmarks Actually Measure Safety Progress?",
    298       "relevance": "Provides the 'safety washing' framing used in Section 6.1 to characterize the transparency asymmetry between capability and safety documentation."
    299     },
    300     {
    301       "title": "Infrastructure for AI Agents",
    302       "relevance": "Directly related work on web conduct norms and agent identification protocols discussed in Sections 4.5 and 6.1."
    303     },
    304     {
    305       "title": "Accountability in an Algorithmic Society",
    306       "relevance": "Provides the accountability diffusion framing used to analyze fragmented responsibility across the agent ecosystem."
    307     }
    308   ],
    309   "engagement_factors": {
    310     "practical_relevance": {
    311       "score": 1,
    312       "justification": "Useful reference index for comparing AI agents but not a tool or technique practitioners can directly apply in their workflow."
    313     },
    314     "surprise_contrarian": {
    315       "score": 2,
    316       "justification": "The stark numbers — 25/30 agents disclose no safety results, only 4/30 have agent-specific system cards — and the 'safety washing' claim challenge the industry's safety narrative."
    317     },
    318     "fear_safety": {
    319       "score": 2,
    320       "justification": "Safety transparency gaps are a central theme with concrete data showing browser agents at L4-L5 autonomy lacking evaluations, plus documented prompt injection incidents."
    321     },
    322     "drama_conflict": {
    323       "score": 2,
    324       "justification": "Directly names companies with missing safety documentation, invokes the 'safety washing' framing, and highlights Perplexity's robots.txt evasion and Amazon's legal threats."
    325     },
    326     "demo_ability": {
    327       "score": 2,
    328       "justification": "The full index is browsable at aiagentindex.mit.edu with structured data downloadable from Zenodo in JSON and CSV formats."
    329     },
    330     "brand_recognition": {
    331       "score": 3,
    332       "justification": "Covers ChatGPT, Claude, Gemini, and Copilot; authors from MIT, Cambridge, Harvard, and Stanford with an MIT-hosted website."
    333     }
    334   },
    335   "hn_data": {
    336     "threads": [
    337       {
    338         "hn_id": "47279778",
    339         "title": "Nested Training for Mutual Adaptation in Human-AI Teaming",
    340         "points": 2,
    341         "comments": 0,
    342         "url": "https://news.ycombinator.com/item?id=47279778",
    343         "created_at": "2026-03-06T19:21:19Z"
    344       }
    345     ],
    346     "top_points": 2,
    347     "total_points": 2,
    348     "total_comments": 0
    349   }
    350 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs