scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24535B)
      1 {
      2   "paper": {
      3     "title": "AWCP: A Workspace Delegation Protocol for Deep-Engagement Collaboration across Remote Agents",
      4     "authors": [
      5       "Xiaohang Nie",
      6       "Zihan Guo",
      7       "Youliang Chen",
      8       "Yuanjian Zhou",
      9       "Weinan Zhang"
     10     ],
     11     "year": 2026,
     12     "venue": "arXiv preprint",
     13     "arxiv_id": "2602.20493"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The paper provides a GitHub link to the reference implementation: https://github.com/SII-Holos/awcp (mentioned in the abstract and Section 4.3)."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The demonstrations use specific datasets (e.g., a directory of 100+ images for cross-modal curation) but no dataset is released or linked. The paper does not provide the demonstration inputs or outputs."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper mentions the implementation is a TypeScript npm workspace monorepo using Vitest and Bun, but provides no requirements.txt, package.json details, Dockerfile, or specific version requirements for reproduction."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No step-by-step reproduction instructions are provided. The paper describes the architecture and implementation structure (Table 3) but does not include a README-style guide for replicating the demonstrations."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": false,
     41         "answer": false,
     42         "justification": "The paper reports no quantitative experimental results. The demonstrations are qualitative live scenarios, not statistical experiments."
     43       },
     44       "significance_tests": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "No comparative quantitative claims are made. The paper presents a protocol design with qualitative demonstrations, not statistical comparisons."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": false,
     51         "answer": false,
     52         "justification": "No quantitative effect sizes are relevant. The paper does not report numerical performance metrics."
     53       },
     54       "sample_size_justified": {
     55         "applies": false,
     56         "answer": false,
     57         "justification": "No experiments with sample sizes are conducted. This is a protocol design paper with live demonstrations."
     58       },
     59       "variance_reported": {
     60         "applies": false,
     61         "answer": false,
     62         "justification": "No repeated experimental runs are conducted. The demonstrations are single executions of live collaboration scenarios."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The paper provides a qualitative comparison to MCP and A2A in the related work section and motivating examples, but no quantitative or systematic baseline comparison is conducted. The demonstrations only show AWCP in isolation without comparing to alternative approaches."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "While MCP and A2A are discussed as contemporary alternatives in Section 2.1, they are not used as baselines in the demonstration section. No head-to-head comparison is performed."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "The protocol has multiple components (transport adapters, state machines, control/data plane separation), but no ablation study is performed to measure the contribution of individual components."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "No quantitative metrics are reported. The demonstrations are described qualitatively (task completed successfully) without measuring latency, overhead, error rates, or other metrics."
     85       },
     86       "human_evaluation": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "Human evaluation is not relevant to this protocol design paper. The claims are about protocol feasibility and design, not about output quality requiring human judgment."
     90       },
     91       "held_out_test_set": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "No benchmark or test set is used. This is a protocol design paper with live demonstrations, not a benchmark evaluation."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "The paper demonstrates two scenarios but provides no breakdown of performance across different task types, transport adapters, or failure conditions. Only success cases are shown."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "No failure cases or error conditions are demonstrated or discussed in Section 5. The paper describes error states in the protocol specification (Section 3.2) but does not show any failures in practice."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "No negative results are reported. Both demonstrations succeed, and no failed configurations, transport adapter issues, or scalability problems are discussed."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The abstract claims AWCP 'bridges the context gap' and enables agents to 'operate on shared files directly with unmodified local toolchains.' While the protocol design supports this in principle, the validation consists only of two qualitative live demonstrations without quantitative evidence. The claim of 'validating the protocol' overstates what two demos show."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper makes causal claims such as 'workspace projection eliminates the context loss inherent in message-based coordination' (Section 6). This is asserted based on two demonstrations without controlled comparison to message-based approaches. No evidence is provided that context loss is actually reduced or eliminated."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper claims AWCP 'paves the way for a universally interoperable agent ecosystem' (abstract) and positions it as 'essential infrastructure for the Agentic Web' (Section 6), but validation consists of only two specific scenarios with specific model pairs. These broad claims significantly exceed the demonstrated scope."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper does not discuss alternative explanations for the success of its demonstrations. For example, it does not consider whether the demonstrated tasks could have been accomplished through existing mechanisms (e.g., A2A with file attachments, or shared Git repos) with comparable effectiveness."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper uses 'DeepSeek V3.2' and 'Gemini 3 Pro' in the demonstrations (Section 5.1). These are marketing names without specific API versions, snapshot dates, or model identifiers. No version information is given for the compliance stamping demo either."
    139       },
    140       "prompts_provided": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The demonstrations describe tasks in natural language (e.g., 'classifying images by content, removing corrupted files, and reorganizing the directory semantically') but the actual prompts sent to the LLMs are not provided."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any of the LLM calls in the demonstrations."
    149       },
    150       "scaffolding_described": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The AWCP protocol itself is the scaffolding, and it is described in extensive detail across Sections 3 and 4, including the dual state machines, transport adapters, message protocol, and control/data plane architecture."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "The demonstration datasets (e.g., the image directory with 100+ files) are described only at a high level. No details are given on how they were constructed, what specific files they contain, or how the compliance stamping scenario was set up."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "There is no dedicated limitations section. Section 6 (Conclusion) mentions 'several directions remain open for future work' but frames these as future opportunities rather than limitations of the current work."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "No threats to validity are discussed. The paper does not address potential issues such as security vulnerabilities of filesystem delegation, scalability limitations, network overhead, or the representativeness of its two demonstrations."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "The paper does not explicitly state what the results do NOT show. The future work section mentions multi-party delegations, access control, and federated coalitions as open problems, but does not frame these as scope boundaries of the current evaluation."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No raw data from the demonstrations is available. The image directory, compliance documents, and execution logs are not released."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "The demonstration scenarios are described at a high level, but the actual data used (images, documents) is not characterized in detail. There is no description of how the cluttered image directory was constructed."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "No human participants are involved. The demonstrations use automated agents."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "The protocol pipeline is well-documented (Sections 3-4), but the data pipeline for the demonstrations (how input data was prepared, what outputs were produced, how success was determined) is not documented."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Funding is disclosed in the Acknowledgements section: 'This research was supported by National Natural Science Foundation of China (62322603 and 625B2185).'"
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Author affiliations are clearly listed: Harbin Institute of Technology, Shanghai Innovation Institute, Sun Yat-sen University, Tongji University, and Shanghai Jiao Tong University. The work is noted as accomplished at Shanghai Innovation Institute."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "The National Natural Science Foundation of China is a government funding agency with no commercial stake in the AWCP protocol's success."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No competing interests statement is present. The paper references the Holos paper [7], which describes a 'web-scale collaboration platform' and shares several authors with this work (Nie, Guo, Zhou, and Zhang appear on both papers). AWCP is positioned as infrastructure for Holos-like systems, but this potential conflict is not disclosed."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "The paper does not evaluate pre-trained model capabilities on benchmarks. The demonstrations test the AWCP protocol, not model knowledge."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "No benchmark evaluation is performed. The paper tests a protocol design, not model performance on known tasks."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "No benchmarks are used. This is a protocol design paper with live demonstrations."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants are involved in this study."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved in this study."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved in this study."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved in this study."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved in this study."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved in this study."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants are involved in this study."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "The demonstrations involve multiple LLM calls (DeepSeek V3.2, Gemini 3 Pro, OpenClaw) but no inference costs, token counts, latency measurements, or API costs are reported."
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "No computational budget is stated. The paper does not report hardware used, wall-clock time for demonstrations, or any resource consumption metrics."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "Existing agent collaboration protocols (MCP, A2A, ANP) operate strictly at the message layer, leaving a gap at the workspace level where agents need filesystem-level access.",
    292       "evidence": "Sections 1 and 2.1 describe the limitations of MCP (discrete function outputs), A2A (structured payloads), and ANP (communication channels), arguing none provide filesystem-level access. A motivating example of a security-auditing agent is given in Section 1.",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "AWCP enables deep-engagement collaboration through workspace delegation, where an Executor operates directly on a Delegator's projected files using unmodified local toolchains.",
    297       "evidence": "Section 3 specifies the protocol framework, and Section 5 demonstrates two scenarios: cross-modal dataset curation (Section 5.1) and multi-round compliance stamping (Section 5.2). Both show the protocol functioning as designed.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "Workspace projection eliminates the context loss inherent in message-based coordination.",
    302       "evidence": "Stated in Section 6 based on the two demonstrations. No quantitative measurement of context loss or comparison to message-based alternatives is provided.",
    303       "supported": "weak"
    304     },
    305     {
    306       "claim": "The protocol accommodates diverse integration modes and pluggable transport adapters adapt to task characteristics.",
    307       "evidence": "Section 4.2 describes four transport adapters (SSHFS, Archive, Storage, Git) with Table 2 comparing properties. Demonstrations use SSHFS (Section 5.1) and Archive (Section 5.2), showing two of four adapters.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "AWCP provides foundational infrastructure for the Agentic Web enabling a universally interoperable agent ecosystem.",
    312       "evidence": "Stated in the abstract and contributions list. The reference implementation comprises ~9,200 lines of TypeScript with 161 test cases (Table 3). However, 'universally interoperable' is validated only with two specific agent configurations.",
    313       "supported": "weak"
    314     }
    315   ],
    316   "methodology_tags": [
    317     "theoretical",
    318     "case-study"
    319   ],
    320   "key_findings": "AWCP introduces a workspace delegation protocol that enables agents to share filesystem-level access rather than exchanging serialized messages. The protocol separates a control plane (HTTP/SSE signaling with dual state machines) from a pluggable transport plane (SSHFS, Archive, Storage, Git adapters). Two live demonstrations show cross-modal dataset curation (text-only agent delegating to multimodal agent) and multi-round compliance stamping, validating the protocol's feasibility for asymmetric agent collaboration. The open-source reference implementation comprises approximately 9,200 lines of TypeScript with 2,600 lines of tests.",
    321   "red_flags": [
    322     {
    323       "flag": "No quantitative evaluation",
    324       "detail": "The paper proposes a protocol and validates it through only two qualitative live demonstrations. No quantitative metrics (latency, overhead, throughput, error rates, task completion rates) are reported. The evaluation is insufficient to support claims about the protocol's practical utility at scale."
    325     },
    326     {
    327       "flag": "Claims outrun evidence",
    328       "detail": "The paper claims AWCP 'paves the way for a universally interoperable agent ecosystem' and is 'essential infrastructure for the Agentic Web,' but these broad claims are supported only by two hand-picked demonstrations with specific model pairs. No evidence is provided for generalizability."
    329     },
    330     {
    331       "flag": "Undisclosed conflict of interest",
    332       "detail": "Several authors (Nie, Guo, Zhou, Zhang) also authored the Holos paper [7], which describes a 'web-scale collaboration platform' that AWCP is positioned to serve. AWCP cites Holos as motivation (Section 1: 'even web-scale collaboration platforms such as Holos [7]... lack a standardized protocol'). This connection between the protocol and a commercial-adjacent platform is not disclosed as a potential conflict."
    333     },
    334     {
    335       "flag": "No failure analysis",
    336       "detail": "Both demonstrations succeed perfectly. No failure modes, edge cases, error recovery scenarios, or scalability stress tests are reported. The protocol defines error and cancellation states (Section 3.2), but these are never exercised in the demonstrations."
    337     },
    338     {
    339       "flag": "No baseline comparison",
    340       "detail": "The paper does not compare AWCP against alternative approaches for the demonstrated tasks. For example, it does not show what happens when the same tasks are attempted via A2A with file attachments, shared Git repos, or other existing mechanisms."
    341     },
    342     {
    343       "flag": "No limitations section",
    344       "detail": "The paper has no dedicated limitations section and does not discuss threats to validity or scope boundaries; open problems are mentioned only as future work. Security implications of filesystem delegation, performance overhead of FUSE mounts, and scalability constraints are not addressed."
    345     }
    346   ],
    347   "cited_papers": [
    348     {
    349       "title": "A survey on large language model based autonomous agents",
    350       "authors": ["Lei Wang", "Chen Ma", "Xueyang Feng"],
    351       "year": 2024,
    352       "doi": "10.1007/s11704-024-40231-1",
    353       "relevance": "Comprehensive survey of LLM-based autonomous agents, foundational context for agent collaboration research."
    354     },
    355     {
    356       "title": "A survey of agent interoperability protocols: Model Context Protocol (MCP), Agent Communication Protocol (ACP), Agent-to-Agent Protocol (A2A), and Agent Network Protocol (ANP)",
    357       "authors": ["Abul Ehtesham", "Aditi Singh", "Gaurav Kumar Gupta", "Saket Kumar"],
    358       "year": 2025,
    359       "arxiv_id": "2505.02279",
    360       "relevance": "Survey of agent interoperability protocols directly relevant to evaluating the landscape AWCP positions itself within."
    361     },
    362     {
    363       "title": "Why do multi-agent LLM systems fail?",
    364       "authors": ["Mert Cemri", "Melissa Z. Pan", "Shuyi Yang"],
    365       "year": 2025,
    366       "arxiv_id": "2503.13657",
    367       "relevance": "Analysis of multi-agent LLM system failures, motivates the workspace-level coordination problem AWCP addresses."
    368     },
    369     {
    370       "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversation",
    371       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    372       "year": 2023,
    373       "arxiv_id": "2308.08155",
    374       "relevance": "Foundational multi-agent framework for LLM applications; AWCP positions itself as complementing such orchestration systems."
    375     },
    376     {
    377       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    378       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"],
    379       "year": 2024,
    380       "relevance": "Major benchmark for evaluating coding agents on real-world software engineering tasks."
    381     },
    382     {
    383       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    384       "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig"],
    385       "year": 2024,
    386       "relevance": "Defines agent-computer interfaces for software engineering agents, directly relevant to the workspace access problem."
    387     },
    388     {
    389       "title": "OpenHands: An open platform for AI software developers as generalist agents",
    390       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    391       "year": 2025,
    392       "relevance": "Open platform for coding agents with container-based sandboxing, related to the execution environment access problem."
    393     },
    394     {
    395       "title": "MetaGPT: Meta programming for a multi-agent collaborative framework",
    396       "authors": ["Sirui Hong", "Mingchen Zhuge", "Jiaqi Chen"],
    397       "year": 2024,
    398       "relevance": "Multi-agent framework encoding standardized operating procedures, relevant to agent collaboration patterns."
    399     },
    400     {
    401       "title": "SEMAP: Software engineering multi-agent protocol",
    402       "authors": ["Zhenyu Mao", "Jacky Keung", "Fengji Zhang"],
    403       "year": 2025,
    404       "arxiv_id": "2510.12120",
    405       "relevance": "Protocol for lifecycle-guided multi-agent software engineering execution, directly comparable to AWCP's scope."
    406     },
    407     {
    408       "title": "Structured agentic software engineering",
    409       "authors": ["Ahmed E. Hassan", "Hao Li", "Dayi Lin"],
    410       "year": 2025,
    411       "arxiv_id": "2509.06216",
    412       "relevance": "Articulates vision for Agent Execution Environments in software engineering, theoretical framework related to AWCP."
    413     },
    414     {
    415       "title": "A survey of AI agent protocols",
    416       "authors": ["Yingxuan Yang", "Huacan Chai", "Yuanyi Song"],
    417       "year": 2025,
    418       "arxiv_id": "2504.16736",
    419       "relevance": "Survey of AI agent communication protocols, provides context for where AWCP fits in the protocol landscape."
    420     },
    421     {
    422       "title": "EvoGit: Decentralized code evolution via git-based multi-agent collaboration",
    423       "authors": ["Beichen Huang", "Ran Cheng", "Kay Chen Tan"],
    424       "year": 2025,
    425       "arxiv_id": "2506.02049",
    426       "relevance": "Git-based multi-agent coordination for coding agents, directly related to workspace-level collaboration."
    427     }
    428   ]
    429 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs