scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (17883B)
      1 {
      2   "paper": {
      3     "title": "Towards a Declarative Agentic Layer for Intelligent Agents in MCP-Based Server Ecosystems",
      4     "authors": [
      5       "María Jesús Rodríguez-Sánchez",
      6       "Manuel Noguera",
      7       "Ángel Ruiz-Zafra",
      8       "Kawtar Benghazi"
      9     ],
     10     "year": 2026,
     11     "venue": "arXiv",
     12     "arxiv_id": "2601.17435",
     13     "doi": "10.48550/arXiv.2601.17435"
     14   },
     15   "scan_version": 3,
     16   "active_modules": [],
     17   "methodology_tags": [
     18     "theoretical"
     19   ],
     20   "key_findings": "The paper proposes DALIA, a declarative architectural layer for MCP-based agentic systems that formalises capabilities, tasks, and agent directories to enable deterministic task graph construction. It argues that current MAS failures stem from lack of architectural structure rather than model limitations, citing empirical work showing 41-86% failure rates in existing frameworks. The architecture is illustrated through a restaurant booking scenario but not empirically evaluated.",
     21   "claims": [
     22     {
     23       "claim": "Current MAS failures stem from absence of explicit architectural structure, not from model limitations.",
     24       "evidence": "Section 2 cites Cemri et al. reporting 41-86% failure rates across 1,642 executions of seven MAS frameworks, with failures categorized into system-design flaws, agent misalignment, and deficient verification.",
     25       "supported": "moderate"
     26     },
     27     {
     28       "claim": "DALIA's deterministic task orchestration constructs executable and verifiable task graphs grounded exclusively in declared operations.",
     29       "evidence": "Section 3.4 describes the mechanism and Section 4 illustrates it through a restaurant booking scenario, but no implementation or empirical evaluation is provided.",
     30       "supported": "weak"
     31     },
     32     {
     33       "claim": "Declarative grounding enables reproducible and verifiable agentic workflows across heterogeneous environments.",
     34       "evidence": "Abstract and Section 4 claim this based on architectural design principles and a single illustrative scenario. No empirical demonstration of reproducibility or verification.",
     35       "supported": "weak"
     36     }
     37   ],
     38   "checklist": {
     39     "artifacts": {
     40       "code_released": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No source code, repository URL, or implementation artifact is mentioned anywhere in the paper."
     44       },
     45       "data_released": {
     46         "applies": false,
     47         "answer": false,
     48         "justification": "Purely theoretical/architectural paper with no data to release."
     49       },
     50       "environment_specified": {
     51         "applies": false,
     52         "answer": false,
     53         "justification": "No implementation or experiments; environment specifications are not applicable."
     54       },
     55       "reproduction_instructions": {
     56         "applies": false,
     57         "answer": false,
     58         "justification": "No experiments to reproduce; the paper presents an architecture with an illustrative scenario only."
     59       }
     60     },
     61     "statistical_methodology": {
     62       "confidence_intervals_or_error_bars": {
     63         "applies": false,
     64         "answer": false,
     65         "justification": "Theoretical paper with no quantitative results."
     66       },
     67       "significance_tests": {
     68         "applies": false,
     69         "answer": false,
     70         "justification": "No comparative experiments performed."
     71       },
     72       "effect_sizes_reported": {
     73         "applies": false,
     74         "answer": false,
     75         "justification": "No quantitative results."
     76       },
     77       "sample_size_justified": {
     78         "applies": false,
     79         "answer": false,
     80         "justification": "Theoretical paper with no samples."
     81       },
     82       "variance_reported": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No experimental runs."
     86       }
     87     },
     88     "evaluation_design": {
     89       "baselines_included": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "Table 1 lists prior approaches and their limitations but does not compare DALIA against them empirically. There is no evaluation baseline."
     93       },
     94       "baselines_contemporary": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "No empirical evaluation to have baselines for."
     98       },
     99       "ablation_study": {
    100         "applies": false,
    101         "answer": false,
    102         "justification": "No implementation or evaluation; ablation is not applicable."
    103       },
    104       "multiple_metrics": {
    105         "applies": false,
    106         "answer": false,
    107         "justification": "No quantitative evaluation."
    108       },
    109       "human_evaluation": {
    110         "applies": false,
    111         "answer": false,
    112         "justification": "No evaluation of any kind."
    113       },
    114       "held_out_test_set": {
    115         "applies": false,
    116         "answer": false,
    117         "justification": "No data or evaluation."
    118       },
    119       "per_category_breakdown": {
    120         "applies": false,
    121         "answer": false,
    122         "justification": "No quantitative results to break down."
    123       },
    124       "failure_cases_discussed": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The paper discusses failure modes of existing systems (Section 2) but does not discuss potential failure cases of DALIA itself."
    128       },
    129       "negative_results_reported": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "No experiments; no negative results reported."
    133       }
    134     },
    135     "claims_and_evidence": {
    136       "abstract_claims_supported": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The abstract claims DALIA 'enables reproducible and verifiable agentic workflows' but the paper only provides a single illustrative scenario with no empirical evidence of reproducibility or verification."
    140       },
    141       "causal_claims_justified": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "Section 3 and abstract use causal language: 'reduces reliance on speculative reasoning', 'reduces the likelihood of hallucinated actions'. No empirical evidence supports these causal claims; they are argued from design principles only."
    145       },
    146       "generalization_bounded": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "The paper claims broad applicability ('heterogeneous environments') but illustrates it only with a trivial restaurant booking scenario. The title suggests generality across 'MCP-Based Server Ecosystems' without bounding the claims to the single illustrated scenario."
    150       },
    151       "alternative_explanations_discussed": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No empirical results to offer alternative explanations for. This is a theoretical architecture proposal."
    155       },
    156       "proxy_outcome_distinction": {
    157         "applies": false,
    158         "answer": false,
    159         "justification": "No measurements taken; purely theoretical paper."
    160       }
    161     },
    162     "setup_transparency": {
    163       "model_versions_specified": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "No models are used in any experiment."
    167       },
    168       "prompts_provided": {
    169         "applies": false,
    170         "answer": false,
    171         "justification": "No prompting used."
    172       },
    173       "hyperparameters_reported": {
    174         "applies": false,
    175         "answer": false,
    176         "justification": "No experiments."
    177       },
    178       "scaffolding_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The entire paper describes the proposed agentic scaffolding (DALIA) in detail: capability semantic model (Section 3.1), ATDP (Section 3.2), Agent Directory (Section 3.3), deterministic task orchestration (Section 3.4), and execution pipeline (Section 3.5)."
    182       },
    183       "data_preprocessing_documented": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No data collected or processed."
    187       }
    188     },
    189     "limitations_and_scope": {
    190       "limitations_section_present": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "Section 5 (Discussion and Future Directions) explicitly acknowledges that 'empirical evaluation is required' and discusses current limitations and future work."
    194       },
    195       "threats_to_validity_specific": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "Section 5 mentions the need for empirical evaluation and richer capability semantics but does not identify specific threats to validity of the proposed architecture (e.g., scalability of deterministic planning, expressiveness limitations of the semantic model)."
    199       },
    200       "scope_boundaries_stated": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section 5 explicitly states: 'DALIA does not prescribe how task graphs are generated internally, nor does it mandate a particular planning algorithm' and notes the need for 'empirical evaluation...particularly when compared to fully prompt-driven approaches.'"
    204       }
    205     },
    206     "data_integrity": {
    207       "raw_data_available": {
    208         "applies": false,
    209         "answer": false,
    210         "justification": "No data collected; theoretical architecture paper."
    211       },
    212       "data_collection_described": {
    213         "applies": false,
    214         "answer": false,
    215         "justification": "No data collection."
    216       },
    217       "recruitment_methods_described": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "No human participants were recruited."
    221       },
    222       "data_pipeline_documented": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No data pipeline."
    226       }
    227     },
    228     "conflicts_of_interest": {
    229       "funding_disclosed": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No funding or acknowledgments section is present in the paper."
    233       },
    234       "affiliations_disclosed": {
    235         "applies": true,
    236         "answer": true,
    237         "justification": "All four authors are listed as affiliated with Universidad de Granada."
    238       },
    239       "funder_independent_of_outcome": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No funding information disclosed; cannot assess funder independence."
    243       },
    244       "financial_interests_declared": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No competing interests or financial interests statement is present."
    248       }
    249     },
    250     "contamination": {
    251       "training_cutoff_stated": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No pre-trained model evaluated on any benchmark."
    255       },
    256       "train_test_overlap_discussed": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No model evaluation on benchmarks."
    260       },
    261       "benchmark_contamination_addressed": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No benchmark evaluation."
    265       }
    266     },
    267     "human_studies": {
    268       "pre_registered": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "irb_or_ethics_approval": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       },
    278       "demographics_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants."
    282       },
    283       "inclusion_exclusion_criteria": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants."
    287       },
    288       "randomization_described": {
    289         "applies": false,
    290         "answer": false,
    291         "justification": "No human participants."
    292       },
    293       "blinding_described": {
    294         "applies": false,
    295         "answer": false,
    296         "justification": "No human participants."
    297       },
    298       "attrition_reported": {
    299         "applies": false,
    300         "answer": false,
    301         "justification": "No human participants."
    302       }
    303     },
    304     "cost_and_practicality": {
    305       "inference_cost_reported": {
    306         "applies": false,
    307         "answer": false,
    308         "justification": "Purely theoretical paper; no method with inference cost."
    309       },
    310       "compute_budget_stated": {
    311         "applies": false,
    312         "answer": false,
    313         "justification": "No computation performed."
    314       }
    315     }
    316   },
    317   "red_flags": [
    318     {
    319       "flag": "No empirical evaluation",
    320       "detail": "The paper proposes an architecture (DALIA) with strong claims about reducing hallucinated actions, enabling reproducibility, and improving reliability, but provides zero empirical evidence. The only illustration is a trivial restaurant booking walkthrough."
    321     },
    322     {
    323       "flag": "Claims significantly outrun evidence",
    324       "detail": "The abstract and body claim DALIA 'enables reproducible and verifiable agentic workflows across heterogeneous environments' and 'reduces reliance on speculative reasoning.' These are empirical claims supported only by architectural arguments and a toy scenario."
    325     },
    326     {
    327       "flag": "Trivial illustrative scenario",
    328       "detail": "The restaurant booking scenario (Section 4) involves two sequential operations and one agent. It does not demonstrate the claimed benefits for 'complex agentic and multi-agent systems' or 'heterogeneous environments.'"
    329     }
    330   ],
    331   "cited_papers": [
    332     {
    333       "title": "Why do multi-agent LLM systems fail?",
    334       "authors": [
    335         "Mert Cemri",
    336         "Melissa Z. Pan",
    337         "Shuyi Yang",
    338         "Lakshya A. Agrawal"
    339       ],
    340       "year": 2025,
    341       "arxiv_id": "2503.13657",
    342       "relevance": "Empirical taxonomy of MAS failure modes across 1,642 executions of seven frameworks — directly relevant to agentic AI reliability."
    343     },
    344     {
    345       "title": "CODER: Issue resolving with multi-agent and task graphs",
    346       "authors": [
    347         "Dong Chen",
    348         "Shaoxin Lin",
    349         "Muhan Zeng"
    350       ],
    351       "year": 2024,
    352       "arxiv_id": "2406.01304",
    353       "relevance": "Multi-agent task graph approach for software engineering issue resolution."
    354     },
    355     {
    356       "title": "MetaGPT: Meta programming for multi-agent collaborative framework",
    357       "authors": [
    358         "Sirui Hong",
    359         "Mingchen Zhuge"
    360       ],
    361       "year": 2023,
    362       "arxiv_id": "2308.00352",
    363       "relevance": "Prominent multi-agent framework for software development."
    364     },
    365     {
    366       "title": "ChatDev: Communicative agents for software development",
    367       "authors": [
    368         "Chen Qian",
    369         "Wei Liu"
    370       ],
    371       "year": 2023,
    372       "arxiv_id": "2307.07924",
    373       "relevance": "Multi-agent software development framework using communicative agents."
    374     },
    375     {
    376       "title": "AgentVerse: Facilitating multi-agent collaboration and exploring emergent behaviors",
    377       "authors": [
    378         "Weize Chen",
    379         "Yusheng Su"
    380       ],
    381       "year": 2023,
    382       "relevance": "Multi-agent collaboration framework relevant to agentic AI evaluation."
    383     },
    384     {
    385       "title": "Toolformer: Language models can teach themselves to use tools",
    386       "authors": [
    387         "Timo Schick",
    388         "Jane Dwivedi-Yu"
    389       ],
    390       "year": 2024,
    391       "relevance": "Foundational work on LLM tool use, directly relevant to agentic AI capabilities."
    392     },
    393     {
    394       "title": "LLM-based multi-agent systems for software engineering: Literature review, vision, and the road ahead",
    395       "authors": [
    396         "Junda He",
    397         "Christoph Treude",
    398         "David Lo"
    399       ],
    400       "year": 2025,
    401       "relevance": "Survey of LLM-based MAS for software engineering, identifying reliability and grounding limitations."
    402     },
    403     {
    404       "title": "Generative to agentic AI: Survey, conceptualization, and challenges",
    405       "authors": [
    406         "Jonas Schneider",
    407         "Chittaranjan Marpaka",
    408         "Patric Tegehall"
    409       ],
    410       "year": 2024,
    411       "relevance": "Survey covering the transition from generative to agentic AI paradigms."
    412     },
    413     {
    414       "title": "AFlow: Large language models as multi-agent system engineers",
    415       "authors": [
    416         "Leyang Zhang",
    417         "Bowen Zhang"
    418       ],
    419       "year": 2024,
    420       "arxiv_id": "2502.14321",
    421       "relevance": "LLM-generated multi-agent systems; cited as example of incoherent automated MAS generation."
    422     },
    423     {
    424       "title": "MCPEval: Automatic MCP-based deep evaluation for AI agent models",
    425       "authors": [
    426         "Yanlin Liu",
    427         "Chen Qiao"
    428       ],
    429       "year": 2025,
    430       "relevance": "Evaluation framework for MCP-based AI agents."
    431     }
    432   ],
    433   "engagement_factors": {
    434     "practical_relevance": {
    435       "score": 1,
    436       "justification": "Proposes an architectural pattern for MCP-based agents that practitioners could conceptually adopt, but no implementation, library, or code exists to use."
    437     },
    438     "surprise_contrarian": {
    439       "score": 0,
    440       "justification": "The claim that MAS failures stem from architectural gaps rather than model limitations is a common position in the systems/engineering community, not a surprising finding."
    441     },
    442     "fear_safety": {
    443       "score": 0,
    444       "justification": "No safety, security, or risk concerns are raised; the paper focuses entirely on reliability and architectural structure."
    445     },
    446     "drama_conflict": {
    447       "score": 0,
    448       "justification": "No controversy, no challenge to specific companies or products, and no replication failure — purely a constructive architectural proposal."
    449     },
    450     "demo_ability": {
    451       "score": 0,
    452       "justification": "No code, no implementation, no prototype — only JSON pseudocode snippets illustrating a theoretical architecture."
    453     },
    454     "brand_recognition": {
    455       "score": 0,
    456       "justification": "Authors are from Universidad de Granada with no major industry affiliation, and the work is not associated with any well-known product or lab."
    457     }
    458   }
    459 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs