ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (22179B)


      1 {
      2   "paper": {
      3     "title": "Agentic Artificial Intelligence (AI): Architectures, Taxonomies, and Evaluation of Large Language Model Agents",
      4     "authors": ["Arunkumar V", "Gangadharan G.R.", "Rajkumar Buyya"],
      5     "year": 2026,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2601.12560"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL or code archive is provided. The paper is a survey with no analysis scripts or data released."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No dataset is released. The paper surveys existing literature and does not release a structured corpus or extracted data."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specification or dependency list is provided. The paper is a survey with no experiments requiring reproducible setup."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No reproduction instructions are provided. There are no experiments to reproduce, and no methodology artifact (e.g., search query logs, inclusion/exclusion scripts) is released."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": false,
     35         "answer": false,
     36         "justification": "This is a survey/review paper with no original statistical experiments. No CIs or error bars are applicable."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "No original comparative experiments are conducted; significance tests do not apply to this survey paper."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No original experiments are run; effect sizes are not applicable to this survey."
     47       },
     48       "sample_size_justified": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No empirical experiment is conducted; sample size justification does not apply."
     52       },
     53       "variance_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No original experiments are run; variance reporting does not apply to this survey."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The survey does not compare itself to prior surveys in any systematic way. It briefly mentions prior surveys (Wang et al., Xi et al., Luo et al.) in the introduction but offers no structured comparison of coverage or quality against them."
     64       },
     65       "baselines_contemporary": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "No baseline comparison experiments are conducted; this criterion does not apply."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No system is built or ablated; the paper is a survey with no components to ablate."
     74       },
     75       "multiple_metrics": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No original experiments are conducted; multiple metrics do not apply."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Human evaluation of the survey's own outputs is not relevant; no system outputs are produced."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No train/test split is relevant for a survey paper."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The survey organizes reviewed systems into six taxonomy dimensions (Core Components, Cognitive Architecture, Learning, Multi-Agent Systems, Environments, Evaluation) and provides multiple tables with per-category breakdowns of architectures and frameworks."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 8 ('Challenges and Future Directions') discusses failure modes including hallucination in action, infinite loops, and latency bottlenecks. Tables 1-4 list critical limitations of each system category."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The survey explicitly notes limitations of current approaches: WebArena shows low success rates (< 15%) on long-horizon tasks; agent success drops from 47% on synchronous tasks to 11% on asynchronous tasks (Section 7.2); prompt-only defenses are described as brittle."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims to propose a unified taxonomy covering Perception, Brain, Planning, Action, Tool Use, and Collaboration, and to review evaluation practices and open challenges. The paper delivers these in Sections 3-8, so the abstract claims are supported."
    111       },
    112       "causal_claims_justified": {
    113         "applies": false,
    114         "answer": false,
    115         "justification": "The paper is a survey that describes and synthesizes existing work; it does not make original causal claims about what causes performance improvements."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes broad claims about 'Agentic AI' and 'LLM-based agents' in general, often reporting results from individual papers (e.g., '30% reduction in bugs' from ChatDev, '50% navigation improvement' from VLM-GroNav) as if they generalize broadly without adequately bounding these to the specific settings and benchmarks in which they were obtained."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": false,
    124         "answer": false,
    125         "justification": "This is a survey/taxonomy paper that presents no original empirical results requiring alternative explanations. This criterion is not applicable to pure surveys that do not present original empirical findings."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "No experiments are run; model version specification does not apply to this survey paper."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "The paper does not use prompting in its own methodology; it is a survey."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No experiments are run; hyperparameter reporting does not apply."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not build or deploy an agentic scaffold; it surveys existing scaffolds."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper does not describe any systematic literature search methodology: no search query strings, databases searched, date ranges, inclusion/exclusion criteria, or counts of papers at each screening stage are provided. The review corpus is opaque."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 8 ('Challenges and Future Directions') provides substantive discussion of open challenges and limitations including hallucination in action, infinite loops, latency/cost, alignment, and theoretical limits."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "Section 8 discusses challenges in the field broadly but does not address threats to validity of the survey itself, such as selection bias in the reviewed literature, recency bias, or lack of systematic search methodology."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what it does not cover or which claims it is not making. No explicit exclusion criteria or out-of-scope domains are stated."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data (list of surveyed papers, extraction tables, coding sheets) is released. The survey's evidence base cannot be independently verified."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No systematic literature search process is described. The paper does not explain how papers were identified, what databases were searched, or what time period was covered."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved. The question of recruitment does not apply to this survey paper."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "There is no documented pipeline from paper collection to inclusion in the survey. No filtering stages or criteria are described, making the selection process entirely opaque."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No acknowledgments or funding section is present in the paper. There is no mention of funding sources."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed on the title page: University College of Engineering (Anna University), National Institute of Technology Tiruchirappalli, and University of Melbourne. The paper does not evaluate tools from any of these institutions' proprietary products."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so funder independence cannot be assessed. The criterion is marked not applicable because no external funder is identified."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper is a survey and does not evaluate a pre-trained model on any benchmark. Contamination questions do not apply."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "The paper does not evaluate a pre-trained model on a benchmark; this criterion does not apply."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper does not run any benchmarks; contamination is not applicable."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this survey paper."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants; IRB approval is not applicable."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants; demographics are not applicable."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants; inclusion/exclusion criteria for participants are not applicable."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants; randomization is not applicable."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants; blinding is not applicable."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants; attrition is not applicable."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "This is a survey paper with no original system being evaluated; inference cost of the survey's own method does not apply."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "This is a survey paper; compute budget for the survey itself is not applicable."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The paper proposes a unified taxonomy decomposing LLM-based agents into six modular dimensions: Core Components, Cognitive Architecture, Learning, Multi-Agent Systems, Environments, and Evaluation.",
    286       "evidence": "Section 3 (Taxonomy) and Figure 1 present this taxonomy. The taxonomy is described as the organizing structure for the paper.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Hierarchical architectures such as ReAcTree maximize task proficiency and reasoning depth but incur exponential increases in token consumption compared to linear chains.",
    291       "evidence": "Section 7.1 and Figure 4 present a multidimensional architectural comparison. Table 4 shows token complexity estimates for different architectures.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "LLM agents achieve 47% success on synchronous tasks, but performance drops to 11% in asynchronous settings, showing a critical lack of temporal awareness.",
    296       "evidence": "Section 7.2 cites Robotouille benchmark [138]. The paper does not conduct this evaluation itself but reports the finding from the cited work.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "ChatDev achieves a 30% reduction in bugs compared to single-agent coding.",
    301       "evidence": "Section 5.3.1 states this claim citing ChatDev [84]. No details on evaluation methodology or sample size are provided in the survey.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "Supervisory architectures (OVON, Good Parenting) can reduce hallucination rates by up to 100% in controlled environments.",
    306       "evidence": "Section 5.3.2 cites [88] and [89]. The claim of '100% reduction' is extraordinary and the survey provides no methodological details to assess its basis.",
    307       "supported": "weak"
    308     },
    309     {
    310       "claim": "VLM-GroNav improves navigation success by 50% by integrating proprioceptive sensing with vision.",
    311       "evidence": "Section 6.2.3 states this but cites [119] without providing evaluation details, benchmark, or baseline context.",
    312       "supported": "weak"
    313     }
    314   ],
    315   "methodology_tags": ["qualitative", "case-study"],
    316   "key_findings": "This survey proposes a unified architecture-focused taxonomy for LLM-based agents organized around six dimensions: Core Components (perception, memory, action, profiling), Cognitive Architecture (planning, reflection), Learning, Multi-Agent Systems, Environments, and Evaluation. The paper traces the evolution from simple ReAct-style loops to hierarchical multi-agent systems and reasoning models with inference-time compute scaling. It highlights that modern agent evaluation should use the CLASSic framework (Cost, Latency, Accuracy, Security, Stability) and that security threats such as indirect prompt injection are a primary barrier to real-world deployment. Open challenges identified include hallucination in action, infinite loops, computational cost, and human-agent alignment.",
    317   "red_flags": [
    318     {
    319       "flag": "No systematic literature search methodology",
    320       "detail": "The paper provides no description of how papers were identified and selected: no search queries, databases, date ranges, or inclusion/exclusion criteria are stated. This makes it impossible to assess coverage or selection bias, and the survey cannot be replicated or updated systematically."
    321     },
    322     {
    323       "flag": "Uncritical laundering of weak empirical claims",
    324       "detail": "The survey repeats strong-sounding quantitative claims from cited papers (e.g., '30% bug reduction' from ChatDev, '100% hallucination reduction' from Good Parenting, '50% navigation improvement' from VLM-GroNav) without evaluating the methodological quality of the underlying studies or noting their limitations. These numbers are presented as established facts."
    325     },
    326     {
    327       "flag": "No competing interests or funding disclosure",
    328       "detail": "The paper contains no acknowledgments section, no funding statement, and no competing interests declaration."
    329     },
    330     {
    331       "flag": "Overly broad generalizations",
    332       "detail": "Results from narrow benchmarks or controlled environments are regularly described as general findings about 'Agentic AI' without bounding the scope to the tested conditions. For example, hallucination reduction claims are from 'controlled environments' but are cited in sections discussing general deployment."
    333     }
    334   ],
    335   "cited_papers": [
    336     {
    337       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    338       "authors": ["S. Yao"],
    339       "year": 2023,
    340       "arxiv_id": "2210.03629",
    341       "relevance": "ReAct is a foundational agentic pattern for LLM agents combining reasoning and acting; highly relevant to the survey's core topic."
    342     },
    343     {
    344       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    345       "authors": ["J. Yang"],
    346       "year": 2024,
    347       "arxiv_id": "2405.15793",
    348       "relevance": "SWE-agent is a prominent LLM-based software engineering agent system, directly in the survey scope."
    349     },
    350     {
    351       "title": "A Survey on Large Language Model based Autonomous Agents",
    352       "authors": ["L. Wang"],
    353       "year": 2024,
    354       "relevance": "A major prior survey on LLM-based autonomous agents; relevant for understanding the coverage gap this paper addresses."
    355     },
    356     {
    357       "title": "MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework",
    358       "authors": ["S. Hong"],
    359       "year": 2024,
    360       "relevance": "MetaGPT is a widely cited multi-agent software engineering framework using SOPs; directly in scope for this survey."
    361     },
    362     {
    363       "title": "Voyager: An Open-Ended Embodied Agent with Large Language Models",
    364       "authors": ["G. Wang"],
    365       "year": 2024,
    366       "relevance": "Voyager is a flagship embodied LLM agent for open-ended learning in games; relevant to agentic AI capabilities."
    367     },
    368     {
    369       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    370       "authors": ["N. Shinn"],
    371       "year": 2023,
    372       "relevance": "Reflexion is a key cognitive architecture technique for LLM agent self-improvement via verbal feedback."
    373     },
    374     {
    375       "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models",
    376       "authors": ["S. Yao"],
    377       "year": 2024,
    378       "relevance": "Tree of Thoughts is a major planning architecture for LLM agents; central to the cognitive architecture section."
    379     },
    380     {
    381       "title": "CAMEL: Communicative Agents for 'Mind' Exploration",
    382       "authors": ["G. Li"],
    383       "year": 2023,
    384       "relevance": "CAMEL is an early and influential multi-agent framework; directly in scope for multi-agent LLM systems survey."
    385     },
    386     {
    387       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework",
    388       "authors": ["C. Wang"],
    389       "year": 2024,
    390       "relevance": "AutoGen is a major multi-agent framework for LLM applications; highly relevant to multi-agent systems research."
    391     },
    392     {
    393       "title": "OSWorld: Benchmarking Multimodal Agents for Open-Ended Tasks in Real Computer Environments",
    394       "authors": ["T. Xie"],
    395       "year": 2025,
    396       "arxiv_id": "2404.07972",
    397       "relevance": "OSWorld is a key benchmark for evaluating OS-level agentic AI; directly in scope for this survey's evaluation section."
    398     },
    399     {
    400       "title": "GAIA: a benchmark for general AI assistants",
    401       "authors": ["G. Mialon"],
    402       "year": 2023,
    403       "arxiv_id": "2311.12983",
    404       "relevance": "GAIA is a benchmark for general AI assistants requiring multi-step tool use; relevant to agent evaluation."
    405     },
    406     {
    407       "title": "Formalizing and Benchmarking Prompt Injection Attacks and Defenses",
    408       "authors": ["Y. Liu"],
    409       "year": 2023,
    410       "arxiv_id": "2310.12815",
    411       "relevance": "Foundational work on prompt injection attacks against LLM agents; central to the security section."
    412     },
    413     {
    414       "title": "AgentBench: Evaluating LLMs as Agents",
    415       "authors": ["X. Liu"],
    416       "year": 2024,
    417       "relevance": "AgentBench is a comprehensive multi-environment benchmark for LLM agents; central to the evaluation section of this survey."
    418     },
    419     {
    420       "title": "Large language model agent: A survey on methodology, applications and challenges",
    421       "authors": ["J. Luo"],
    422       "year": 2025,
    423       "arxiv_id": "2503.21460",
    424       "relevance": "A competing/complementary survey on LLM agents from a methodology perspective; directly relevant for understanding the survey landscape."
    425     },
    426     {
    427       "title": "Top of the CLASS: Benchmarking LLM Agents on Real-World Enterprise Tasks",
    428       "authors": ["M. Wornow"],
    429       "year": 2025,
    430       "relevance": "Introduces the CLASSic evaluation framework (Cost, Latency, Accuracy, Security, Stability) adopted by this survey."
    431     }
    432   ]
    433 }

Impressum · Datenschutz