ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (18903B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "position",
      4   "paper": {
      5     "title": "Towards a Declarative Agentic Layer for Intelligent Agents in MCP-Based Server Ecosystems",
      6     "authors": [
      7       "María Jesús Rodríguez-Sánchez",
      8       "Manuel Noguera",
      9       "Ángel Ruiz-Zafra",
     10       "K. Benghazi"
     11     ],
     12     "year": 2026,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2601.17435",
     15     "doi": "10.48550/arXiv.2601.17435"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "The abstract claims DALIA 'enables reproducible and verifiable agentic workflows' but the paper only provides a single illustrative scenario with no empirical evidence of reproducibility or verification.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "Section 3 and abstract use causal language: 'reduces reliance on speculative reasoning', 'reduces the likelihood of hallucinated actions'. No empirical evidence supports these causal claims; they are argued from design principles only.",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper claims broad applicability ('heterogeneous environments') but only illustrates with a trivial restaurant booking scenario. The title suggests generality across 'MCP-Based Server Ecosystems' without bounding to the tested scope.",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "No empirical results to offer alternative explanations for. This is a theoretical architecture proposal.",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No measurements taken; purely theoretical paper.",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 5 (Discussion and Future Directions) explicitly acknowledges that 'empirical evaluation is required' and discusses current limitations and future work.",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Section 5 mentions the need for empirical evaluation and richer capability semantics but does not identify specific threats to validity of the proposed architecture (e.g., scalability of deterministic planning, expressiveness limitations of the semantic model).",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Section 5 explicitly states: 'DALIA does not prescribe how task graphs are generated internally, nor does it mandate a particular planning algorithm' and notes the need for 'empirical evaluation...particularly when compared to fully prompt-driven approaches.'",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding or acknowledgments section is present in the paper.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors are listed as affiliated with Universidad de Granada.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No funding information disclosed; cannot assess funder independence.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interests statement is present.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are defined structurally: 'agent' (execution entity linked to MCP servers), 'capability' (executable operation with semantic attributes), 'task' (higher-level objective), 'declarative' (structured metadata). Definitions are present, though some terms like 'verifiable' are used without precise definition.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Contribution is explicit: propose DALIA, a declarative architectural layer with four components (capability semantic model, ATDP, Agent Directory MCP, deterministic orchestrator) to ground agentic workflows.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Strong engagement: Table 1 systematically compares DALIA to six prior approaches (MAST, CODER, AFlow/MAS-GPT, MCP, ScaleMCP/MCPEval, agentic AI surveys), identifying specific limitations each addresses. Discussion shows how DALIA builds on and differs from existing work.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "position": {
    119       "argument_quality": {
    120         "argument_internally_consistent": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "Argument is logically coherent: (1) MAS fail at high rates, (2) failures stem from structural lack of capability grounding, (3) declarative representation can provide this grounding, (4) therefore DALIA should help. No internal contradictions detected.",
    124           "source": "haiku"
    125         },
    126         "counterarguments_addressed": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "Paper does not engage with strongest opposing views. Does not discuss: 'Why not improve prompting?', 'Why not use existing workflow orchestration tools?', 'Is this over-engineered for simple scenarios?' Absence of counterargument engagement is a critical gap for a position paper.",
    130           "source": "haiku"
    131         },
    132         "analogies_appropriate": {
    133           "applies": false,
    134           "answer": false,
    135           "justification": "Paper uses no explicit analogies to external systems or domains.",
    136           "source": "haiku"
    137         },
    138         "prescriptions_proportional": {
    139           "applies": true,
    140           "answer": true,
     141           "justification": "Paper prescribes building DALIA, a multi-component architecture. Prescription is proportional to argument strength: diagnosis of lack of structure supports recommendation to add structure. However, no alternatives are proposed, which limits the proportionality assessment.",
    142           "source": "haiku"
    143         },
    144         "evidence_for_claims_cited": {
    145           "applies": true,
    146           "answer": true,
    147           "justification": "Factual claims are cited: MAS failure rates [5], MAST taxonomy [5], task graph incoherence [4,9], MCP limitations [12]. However, prescriptive claims about DALIA's benefits (reducing hallucinations, improving reproducibility) are asserted without cited evidence.",
    148           "source": "haiku"
    149         },
    150         "alternatives_discussed": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "Paper proposes DALIA without discussing alternative approaches to the same problem. Should address: improved prompting strategies, fine-tuning methods, simpler architectural changes, or why existing tools (Airflow, Temporal) are insufficient.",
    154           "source": "haiku"
    155         },
    156         "historical_context_accurate": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "References to MCP emergence, multi-agent systems history (MetaGPT, ChatDev, AgentVerse), and agentic AI surveys appear factually accurate and appropriately contextualized.",
    160           "source": "haiku"
    161         }
    162       },
    163       "clarity_and_scope": {
    164         "key_terms_defined_precisely": {
    165           "applies": true,
    166           "answer": true,
    167           "justification": "Key terms are defined with context: 'agent' (Listing 1 and Listing 4 show structure), 'capability' (Listing 2 provides semantic model), 'task' (Listing 3 shows task declaration). Definitions are precise enough for the architectural discussion.",
    168           "source": "haiku"
    169         },
    170         "engages_with_existing_literature": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Paper substantively engages with literature: cites specific findings from MAST (41-86% failure rates, 14 failure modes), CODER (task graphs), AFlow/MAS-GPT (incoherence), and MCP ecosystem. Table 1 provides structured comparison showing how DALIA addresses gaps identified by each prior approach.",
    174           "source": "haiku"
    175         },
    176         "intended_audience_clear": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "Audience is reasonably clear: MCP ecosystem developers, multi-agent system researchers, and practitioners building agentic systems. Target is implicit but supportable from the focus on MCP compatibility and architectural design.",
    180           "source": "haiku"
    181         },
    182         "assumptions_stated": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "Core assumptions are stated: (1) MCP is semantically under-specified, (2) agents cannot reliably generate valid plans without grounding, (3) declarative representation is superior to linguistic inference. Not exhaustive, but primary assumptions are explicit.",
    186           "source": "haiku"
    187         },
    188         "scope_of_applicability_discussed": {
    189           "applies": true,
    190           "answer": false,
    191           "justification": "Paper does not discuss where DALIA applies vs. does not apply. No discussion of domain constraints, scalability limits, task complexity requirements, or scenarios where declarative grounding might be insufficient or harmful.",
    192           "source": "haiku"
    193         }
    194       }
    195     }
    196   },
    197   "claims": [
    198     {
    199       "claim": "Multi-agent LLM systems fail at rates of 41-86% across seven frameworks",
    200       "evidence": "Cemri et al. [5] 'examines 1,642 executions across seven MAS frameworks'",
    201       "supported": "strong"
    202     },
    203     {
    204       "claim": "Failures stem from structural design flaws (system design, agent alignment, verification), not model limitations",
    205       "evidence": "MAST taxonomy [5] identifying three failure categories, Section 2 discussion of Grounded Theory analysis",
    206       "supported": "strong"
    207     },
    208     {
    209       "claim": "LLM-generated task graphs are frequently incoherent or unexecutable",
    210       "evidence": "Cited to [4] and [9]; Section 2 states 'graphs produced through unconstrained language modelling tend to be incomplete, incoherent, or ungrounded'",
    211       "supported": "moderate"
    212     },
    213     {
    214       "claim": "MCP exposes tools without semantic relationships, task-level structure, or multi-server coordination",
    215       "evidence": "Section 2 description of MCP limitations, noting lack of 'semantic descriptions, dependencies, roles or relations'",
    216       "supported": "moderate"
    217     },
    218     {
    219       "claim": "Declarative grounding constrains agent behavior to a verifiable operational space",
    220       "evidence": "Section 3.4 and restaurant scenario (Section 4) demonstrate deterministic graph construction from declared capabilities",
    221       "supported": "weak"
    222     },
    223     {
    224       "claim": "Strict separation of discovery, planning, and execution prevents hallucinated actions and unexecutable workflows",
    225       "evidence": "Architectural design principle stated in Section 3.5; claimed but not empirically validated",
    226       "supported": "unsupported"
    227     },
    228     {
    229       "claim": "DALIA enables reproducible and verifiable agentic workflows across heterogeneous environments",
    230       "evidence": "Abstract and Section 4 restaurant scenario only; no comparison to baseline systems or evaluation at scale",
    231       "supported": "weak"
    232     }
    233   ],
    234   "methodology_tags": [
    235     "theoretical",
    236     "case-study"
    237   ],
    238   "key_findings": "The paper identifies that multi-agent LLM systems suffer 41-86% failure rates due to structural limitations—lack of explicit capability grounding, semantic tool descriptions, and coordinated planning mechanisms—not model inadequacy. DALIA addresses this gap through a four-component declarative architectural layer: (1) a capability semantic model enriching tool definitions with preconditions/postconditions, (2) an Agentic Task Discovery Protocol for declarative task exposure, (3) a federated Agent Directory, and (4) deterministic task orchestration grounded exclusively in declared capabilities. The paper illustrates DALIA via a restaurant booking scenario showing how phased discovery, planning, and execution prevent hallucinated actions and enable reproducible workflows.",
    239   "red_flags": [
    240     {
    241       "flag": "No empirical evaluation",
    242       "detail": "Paper proposes DALIA but provides zero empirical evidence of effectiveness. No user study, no performance benchmarks, no comparison to existing systems or simpler baselines. Only a toy scenario provided."
    243     },
    244     {
    245       "flag": "Unsupported causal claims",
    246       "detail": "Claims that declarative representation reduces hallucinations and improves reliability are asserted without causal evidence (no ablation studies, no RCT, no control group)."
    247     },
    248     {
    249       "flag": "No counterargument engagement",
    250       "detail": "For a position paper, critical weakness: does not address why improved prompting, fine-tuning, or simpler workflow tools (Airflow, Temporal) would be insufficient. No discussion of alternatives."
    251     },
    252     {
    253       "flag": "Overclaimed scope",
    254       "detail": "Paper claims DALIA applies to 'heterogeneous environments' and 'dynamic environments' but demonstrates only a single, static restaurant scenario. Generality is not bounded or validated."
    255     },
    256     {
    257       "flag": "Vague on implementation",
    258       "detail": "Section 3.5 states 'the internal reasoning mechanisms used to guide task selection or graph construction are intentionally left unspecified.' Core algorithmic detail is omitted."
    259     },
    260     {
    261       "flag": "Minimal limitations discussion",
    262       "detail": "Section 5 provides placeholder future work ('empirical evaluation is required') rather than honest assessment of DALIA's limitations, operational overhead, or when it might fail."
    263     },
    264     {
    265       "flag": "No operational complexity analysis",
     266       "detail": "Maintaining three catalogs (capabilities, tasks, agents) and a federated directory imposes non-zero overhead and coordination burden; the paper does not discuss this."
    267     },
    268     {
    269       "flag": "Missing funding and COI disclosure",
     270       "detail": "No statement of funding source or competing interests is provided; the only disclosure is that all authors share a single institutional affiliation."
    271     }
    272   ],
    273   "cited_papers": [
    274     {
    275       "title": "Why do multi-agent llm systems fail? (MAST taxonomy)",
    276       "relevance": "Identifies 14 recurring failure modes across three categories; motivates need for structural architectural changes rather than prompting."
    277     },
    278     {
    279       "title": "Graphs meet ai agents: Taxonomy, progress, and future opportunities",
    280       "relevance": "Demonstrates that LLM-generated task graphs are incomplete, incoherent, or ungrounded in real capabilities."
    281     },
    282     {
    283       "title": "LLM-based multi-agent systems for software engineering: Literature review",
    284       "relevance": "Surveys current agentic AI approaches; shows reliance on linguistic inference rather than grounded operational models."
    285     },
    286     {
    287       "title": "Model context protocol (MCP): Landscape, security threats, and future research directions",
    288       "relevance": "Technical foundation for DALIA; identifies semantic limitations of current MCP (lacks task-level structure, dependency representations)."
    289     },
    290     {
    291       "title": "MetaGPT: Multi-agent framework",
    292       "relevance": "Example of existing multi-agent system architecture; illustrates brittleness and domain-specificity of current approaches."
    293     },
    294     {
    295       "title": "CODER: Issue resolving with multi-agent and task graphs",
    296       "relevance": "Shows that predefined task graphs improve reliability; motivates DALIA's declarative graph construction from formal capability descriptions."
    297     }
    298   ],
    299   "engagement_factors": {
    300     "practical_relevance": {
    301       "score": 1,
    302       "justification": "Proposes an architectural pattern for MCP-based agents that practitioners could conceptually adopt, but no implementation, library, or code exists to use."
    303     },
    304     "surprise_contrarian": {
    305       "score": 0,
    306       "justification": "The claim that MAS failures stem from architectural gaps rather than model limitations is a common position in the systems/engineering community, not a surprising finding."
    307     },
    308     "fear_safety": {
    309       "score": 0,
    310       "justification": "No safety, security, or risk concerns are raised; the paper focuses entirely on reliability and architectural structure."
    311     },
    312     "drama_conflict": {
    313       "score": 0,
    314       "justification": "No controversy, no challenge to specific companies or products, and no replication failure — purely a constructive architectural proposal."
    315     },
    316     "demo_ability": {
    317       "score": 0,
    318       "justification": "No code, no implementation, no prototype — only JSON pseudocode snippets illustrating a theoretical architecture."
    319     },
    320     "brand_recognition": {
    321       "score": 0,
    322       "justification": "Authors are from Universidad de Granada with no major industry affiliation, and the work is not associated with any well-known product or lab."
    323     }
    324   },
    325   "hn_data": {
    326     "threads": [
    327       {
    328         "hn_id": "46802368",
    329         "title": "Show HN: If You Want Coherence, Orchestrate a Team of Rivals: Multi-Agent \"",
    330         "points": 10,
    331         "comments": 0,
    332         "url": "https://news.ycombinator.com/item?id=46802368",
    333         "created_at": "2026-01-28T22:16:52Z"
    334       },
    335       {
    336         "hn_id": "46776398",
    337         "title": "The 17% Gap: Quantifying Epistemic Decay in AI-Assisted Survey Papers",
    338         "points": 1,
    339         "comments": 1,
    340         "url": "https://news.ycombinator.com/item?id=46776398",
    341         "created_at": "2026-01-27T06:58:02Z"
    342       },
    343       {
    344         "hn_id": "46913890",
    345         "title": "Predicting Zero-Shot Classification Performance for Arbitrary Queries",
    346         "points": 1,
    347         "comments": 0,
    348         "url": "https://news.ycombinator.com/item?id=46913890",
    349         "created_at": "2026-02-06T15:19:31Z"
    350       }
    351     ],
    352     "top_points": 10,
    353     "total_points": 12,
    354     "total_comments": 1
    355   }
    356 }

Impressum · Datenschutz