ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (20610B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "position",
      4   "paper": {
      5     "title": "Towards a Declarative Agentic Layer for Intelligent Agents in MCP-Based Server Ecosystems",
      6     "authors": [
      7       "María Jesús Rodríguez-Sánchez",
      8       "Manuel Noguera",
      9       "Ángel Ruiz-Zafra",
     10       "K. Benghazi"
     11     ],
     12     "year": 2026,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2601.17435",
     15     "doi": "10.48550/arXiv.2601.17435"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "The abstract claims that agent failures 'do not stem from limitations of the underlying models themselves, but from the absence of explicit architectural structure,' but the paper provides no comparative evidence (ablation, controlled study, or data) to distinguish model limitations from architectural ones. Section 2 cites existing survey results but does not isolate architectural factors.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper makes causal claims ('declarative grounding enables reproducible workflows,' 'introducing explicit grounding can constrain behaviour without limiting expressiveness') without justification. Section 4 acknowledges the illustrative scenario is 'not to demonstrate optimisation or performance.' No ablation, comparison, or empirical test provided.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper claims DALIA is 'model-independent,' applies to 'heterogeneous environments' and 'dynamic environments,' but evidence is limited to one toy scenario (restaurant booking). Scope of applicability is not bounded to the tested/argued setting.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper does not discuss why other structural approaches (e.g., hierarchical planning, constraint-based reasoning, adaptive prompting) might address the core problem. Alternative explanations for MAS failures are not considered.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims DALIA improves 'reliability,' 'robustness,' 'reproducibility,' and 'verifiability,' but admits in Section 5: 'empirical evaluation is required to assess how declarative grounding affects reliability, robustness and failure rates.' No actual outcomes are measured.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Section 5 is titled 'Discussion and Future Directions,' not 'Limitations.' One sentence mentions 'DALIA does not prescribe how task graphs are generated internally,' but no dedicated limitations section exists.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Specific threats are not discussed. The paper states 'further work is required' and lists gaps (need for empirical evaluation, richer semantics, decentralized coordination) as future directions, but does not analyze validity threats in the current proposal.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper does not explicitly state what it does NOT address: how the LLM generates the user goal, error recovery strategies during execution, or how the orchestrator maps abstract goals to ATDP tasks. Scope boundaries are implicit, not stated.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding section or acknowledgments section present. No funding sources are disclosed.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "All authors list Universidad de Granada as affiliation. No conflict-of-interest statement or disclosure of affiliations with MCP or related tools is provided.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding disclosed; cannot assess independence.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting related to MCP or agent systems) is provided.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are defined in context: 'Agent' as 'execution entities rather than autonomous planners' (§3.3), 'Capability' with formal schema (§3.1), 'Task' as 'higher-level objective fulfilled through declared capabilities' (§3.2). MCP is explained as a 'lightweight standard for tool discovery.'",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The abstract explicitly states four contributions: (1) formal semantic model of capabilities, (2) Agentic Task Discovery Protocol, (3) federated Agent Directory, (4) deterministic task orchestration. The intended contribution is unambiguous.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 reviews Cemri et al. (MAS failures), MAST taxonomy, task graph literature, AFlow/MAS-GPT, MCP standard, and ScaleMCP. Table 1 systematically maps prior limitations to DALIA's proposed solutions. Strong engagement demonstrated.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "position": {
    119       "argument_quality": {
    120         "argument_internally_consistent": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "Core argument is consistent: MAS failures stem from a lack of architectural structure → declarative grounding provides structure → workflows improve. Sections 2–4 develop this logically without contradictions.",
    124           "source": "haiku"
    125         },
    126         "counterarguments_addressed": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "The paper does not engage with counterarguments: Why won't better prompting address these issues? Why is architectural change necessary rather than improved models? These are implicitly dismissed but never argued against.",
    130           "source": "haiku"
    131         },
    132         "analogies_appropriate": {
    133           "applies": true,
    134           "answer": true,
    135           "justification": "The closing mentions alignment with 'service-oriented architectures and workflow systems,' which is an apt structural analogy. No false equivalences or extended inappropriate analogies.",
    136           "source": "haiku"
    137         },
    138         "prescriptions_proportional": {
    139           "applies": true,
    140           "answer": true,
    141           "justification": "The prescription (adopt declarative grounding in agentic architectures) is proportional to the argument (evidence of structural failures in MAS). No sweeping policy claims or overreach.",
    142           "source": "haiku"
    143         },
    144         "evidence_for_claims_cited": {
    145           "applies": true,
    146           "answer": true,
    147           "justification": "Factual claims are cited: Cemri et al. [5] for failure rates, [4] and [9] for task graph issues, [3], [11], [14] for MCP limitations. Citations support empirical assertions.",
    148           "source": "haiku"
    149         },
    150         "alternatives_discussed": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "Section 5 mentions alternative orchestration strategies (symbolic planners, heuristic search, rule-based, LLM-driven) but does NOT discuss alternative solutions to the core problem: prompt engineering, larger models, hybrid human-AI systems, or hierarchical planning approaches.",
    154           "source": "haiku"
    155         },
    156         "historical_context_accurate": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "References to LLM capabilities, MCP emergence, multi-agent frameworks (MetaGPT, ChatDev, AgentVerse) are accurate. No historical inaccuracies detected.",
    160           "source": "haiku"
    161         }
    162       },
    163       "clarity_and_scope": {
    164         "key_terms_defined_precisely": {
    165           "applies": true,
    166           "answer": true,
    167           "justification": "Key terms are defined with precision: 'capability' includes formal attributes (role, domain, inputs, outputs, preconditions, postconditions); 'task' is explicitly tied to capability composition; 'agent' is defined as execution entity, not planner.",
    168           "source": "haiku"
    169         },
    170         "engages_with_existing_literature": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Section 2 cites and compares against six major research directions (MAST, CODER, AFlow, MCP, ScaleMCP, agentic AI surveys). Table 1 shows explicit tradeoff analysis.",
    174           "source": "haiku"
    175         },
    176         "intended_audience_clear": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "The paper is clearly targeted at researchers and architects building agentic AI systems, particularly those working with MCP-based ecosystems. Audience is implicit but unambiguous from scope.",
    180           "source": "haiku"
    181         },
    182         "assumptions_stated": {
    183           "applies": true,
    184           "answer": false,
    185           "justification": "The paper assumes MCP servers will provide DALIA-compatible metadata, that tasks can be declared upfront, and that deterministic planning is preferable to adaptive LLM reasoning. These assumptions are not made explicit.",
    186           "source": "haiku"
    187         },
    188         "scope_of_applicability_discussed": {
    189           "applies": true,
    190           "answer": false,
    191           "justification": "The paper does not discuss where declarative grounding is applicable vs. not applicable. Does it work for open-ended exploration? For real-time dynamic task generation? For adversarial environments? Scope is not discussed.",
    192           "source": "haiku"
    193         }
    194       }
    195     }
    196   },
    197   "claims": [
    198     {
    199       "claim": "Multi-agent LLM systems have failure rates between 41% and 86% across MAS frameworks",
    200       "evidence": "Cemri et al. analysis of 1,642 executions across seven frameworks (cited [5])",
    201       "supported": "strong"
    202     },
    203     {
    204       "claim": "LLM-based agents produce hallucinated actions and unexecutable task graphs because they lack grounding in real capabilities",
    205       "evidence": "Cited prior work [4], [9]; noted in Section 2 but not directly empirically demonstrated in this paper",
    206       "supported": "moderate"
    207     },
    208     {
    209       "claim": "MCP tools lack semantic relationships, task structure, and multi-server coordination capabilities",
    210       "evidence": "Technical observation; cited extensions (ScaleMCP [11], MCPEval [14]) acknowledged as operational only, not semantic (Section 2)",
    211       "supported": "moderate"
    212     },
    213     {
    214       "claim": "Declarative grounding of capabilities, tasks, and agents reduces hallucinated actions and invalid task graphs",
    215       "evidence": "Proposed by DALIA design; no empirical validation provided; admitted in Section 5 that 'empirical evaluation is required'",
    216       "supported": "unsupported"
    217     },
    218     {
    219       "claim": "DALIA enables reproducible and verifiable agentic workflows across heterogeneous environments",
    220       "evidence": "Architectural argument; single illustrative restaurant booking scenario (acknowledged as illustrative, not evaluative)",
    221       "supported": "unsupported"
    222     },
    223     {
    224       "claim": "Separating discovery, planning, and execution prevents speculative reasoning and improves reliability",
    225       "evidence": "Design principle stated; no ablation or comparative study provided",
    226       "supported": "unsupported"
    227     },
    228     {
    229       "claim": "Task graphs generated without awareness of real capabilities result in incoherent routes and hallucinated operations",
    230       "evidence": "Cited prior work [4], [9]; empirical examples not provided",
    231       "supported": "moderate"
    232     },
    233     {
    234       "claim": "MAS failures stem from lack of architectural structure, not model limitations",
    235       "evidence": "Inferred from Cemri et al. MAST taxonomy (system design, agent alignment, verification failures); not directly proven by ablation",
    236       "supported": "moderate"
    237     }
    238   ],
    239   "methodology_tags": [
    240     "position",
    241     "theoretical"
    242   ],
    243   "key_findings": "This paper proposes DALIA, a declarative architectural layer for grounding LLM-based agent systems through explicit specification of capabilities, tasks, and agent roles. Rather than relying on linguistic reasoning for planning and coordination, DALIA separates discovery, planning, and execution into distinct phases governed by declarative metadata. The authors motivate DALIA by citing high failure rates (41-86%) in existing multi-agent systems and limitations of MCP's tool abstraction, but provide only an illustrative scenario (restaurant booking) and no empirical validation of the proposed approach. The paper acknowledges that 'empirical evaluation is required' and does not implement or test DALIA.",
    244   "red_flags": [
    245     {
    246       "flag": "No empirical validation",
    247       "detail": "The paper admits in Section 5: 'empirical evaluation is required to assess how declarative grounding affects reliability, robustness and failure rates.' No controlled comparison, ablation study, or implementation testing is provided."
    248     },
    249     {
    250       "flag": "Single toy scenario",
    251       "detail": "The illustrative scenario (restaurant booking with two steps) is acknowledged as not demonstrating 'optimisation or performance.' It is too simple to validate claims about heterogeneous multi-agent systems."
    252     },
    253     {
    254       "flag": "Vague on implementation details",
    255       "detail": "How does ATDP actually work? How are task graphs 'synthesized deterministically'? Section 5 states 'internal reasoning mechanisms...intentionally left unspecified,' undermining the core argument that explicit structure matters."
    256     },
    257     {
    258       "flag": "Adoption problem not addressed",
    259       "detail": "The paper assumes MCP servers will voluntarily provide DALIA-compatible metadata (capability schemas, ATDP endpoints, Agent Directory entries). No discussion of incentives, migration path, or compatibility with existing MCP ecosystems."
    260     },
    261     {
    262       "flag": "Circular reasoning on causation",
    263       "detail": "The paper defines the problem as 'lack of declarative structure' and proposes 'declarative structure' as solution, but provides no causal evidence that structure—not prompt quality, not model capacity—is the binding constraint."
    264     },
    265     {
    266       "flag": "Missing conflict-of-interest disclosures",
    267       "detail": "No funding source, no competing interests statement, no disclosure of affiliations with MCP tools or agent frameworks. All of these are required for position papers proposing adoption of specific architectures."
    268     },
    269     {
    270       "flag": "No limitations section",
    271       "detail": "Section 5 is titled 'Discussion and Future Directions' rather than 'Limitations.' There is one mention of 'current limitations' but no systematic analysis of scope boundaries or threats to the proposal's validity."
    272     },
    273     {
    274       "flag": "Implicit assumptions not stated",
    275       "detail": "Assumes tasks can be declared upfront, deterministic planning is preferable to adaptive reasoning, and that 'hallucinated actions' are primarily an architectural problem. These are not made explicit for reader evaluation."
    276     }
    277   ],
    278   "cited_papers": [
    279     {
    280       "title": "Why do multi-agent LLM systems fail? (MAST: Multi-Agent System Taxonomy)",
    281       "authors": "Mert Cemri et al.",
    282       "year": 2025,
    283       "relevance": "Empirical taxonomy of 1,642 MAS executions identifying failure modes in system design, agent alignment, and verification—core motivation for DALIA"
    284     },
    285     {
    286       "title": "CODER: Issue resolving with multi-agent and task graphs",
    287       "authors": "Dong Chen et al.",
    288       "year": 2024,
    289       "relevance": "Demonstrates task graph approach to coordinating agents; cited for limitation that LLM-generated graphs are incoherent or ungrounded"
    290     },
    291     {
    292       "title": "AFlow: Automating agentic workflow generation",
    293       "authors": "Jiayi Zhang et al.",
    294       "year": 2024,
    295       "relevance": "Example of LLM-generated MAS producing incoherent structures; motivates need for explicit grounding"
    296     },
    297     {
    298       "title": "Graphs meet AI agents: Taxonomy, progress, and future opportunities",
    299       "authors": "Yuanchen Bei et al.",
    300       "year": 2025,
    301       "relevance": "Demonstrates LLMs cannot reliably generate task graphs; cites evidence that unconstrained language modelling produces incomplete or incoherent plans"
    302     },
    303     {
    304       "title": "LLM-based multi-agent systems for software engineering: Literature review, vision, and the road ahead",
    305       "authors": "Junda He, Christoph Treude, David Lo",
    306       "year": 2025,
    307       "relevance": "Survey emphasizing that agentic AI systems operate without explicit grounding in available actions—core problem DALIA addresses"
    308     },
    309     {
    310       "title": "Generative to agentic AI: Survey, conceptualization, and challenges",
    311       "authors": "Johannes Schneider",
    312       "year": 2025,
    313       "relevance": "Prior survey arguing agentic architectures require planning, memory, and tool use; context for DALIA's architecture proposal"
    314     }
    315   ],
    316   "engagement_factors": {
    317     "practical_relevance": {
    318       "score": 2,
    319       "justification": "DALIA is proposed as a practical architectural layer for production multi-agent systems, but no implementation, reference code, or prototype is released or available, limiting immediate practitioner use."
    320     },
    321     "surprise_contrarian": {
    322       "score": 1,
    323       "justification": "The paper argues for declarative structure over prompt-driven orchestration, but this is more 'engineering best practice' than a contrarian position. Not framed as challenging conventional wisdom."
    324     },
    325     "fear_safety": {
    326       "score": 0,
    327       "justification": "No discussion of safety implications, alignment, or risk. The paper focuses on reliability and verifiability, not safety concerns."
    328     },
    329     "drama_conflict": {
    330       "score": 0,
    331       "justification": "A technical architecture proposal with no controversy, debate, or conflicting perspectives. No drama angle."
    332     },
    333     "demo_ability": {
    334       "score": 0,
    335       "justification": "No reference implementation, no code release, no working prototype. The illustrative scenario is described in prose, not demonstrated interactively."
    336     },
    337     "brand_recognition": {
    338       "score": 1,
    339       "justification": "Universidad de Granada is a legitimate institution, but not a major AI lab. Authors are not well-known in agentic AI research."
    340     }
    341   },
    342   "hn_data": {
    343     "threads": [
    344       {
    345         "hn_id": "46802368",
    346         "title": "Show HN: If You Want Coherence, Orchestrate a Team of Rivals: Multi-Agent \"",
    347         "points": 10,
    348         "comments": 0,
    349         "url": "https://news.ycombinator.com/item?id=46802368",
    350         "created_at": "2026-01-28T22:16:52Z"
    351       },
    352       {
    353         "hn_id": "46776398",
    354         "title": "The 17% Gap: Quantifying Epistemic Decay in AI-Assisted Survey Papers",
    355         "points": 1,
    356         "comments": 1,
    357         "url": "https://news.ycombinator.com/item?id=46776398",
    358         "created_at": "2026-01-27T06:58:02Z"
    359       },
    360       {
    361         "hn_id": "46913890",
    362         "title": "Predicting Zero-Shot Classification Performance for Arbitrary Queries",
    363         "points": 1,
    364         "comments": 0,
    365         "url": "https://news.ycombinator.com/item?id=46913890",
    366         "created_at": "2026-02-06T15:19:31Z"
    367       }
    368     ],
    369     "top_points": 10,
    370     "total_points": 12,
    371     "total_comments": 1
    372   }
    373 }

Impressum · Datenschutz