scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (18390B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Exploring Large Language Model based Intelligent Agents: Definitions, Methods, and Prospects",
      6     "authors": [
      7       "Yuheng Cheng",
      8       "Ceyao Zhang",
      9       "Zhengwen Zhang",
     10       "Xiangrui Meng",
     11       "Sirui Hong"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2401.03428",
     16     "doi": "10.48550/arXiv.2401.03428"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract promises an in-depth overview covering definitions, frameworks, foundational components, multi-agent mechanisms, datasets, applications, and prospects — all of which are delivered in the body of the paper across Sections 2–6.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "The paper is a descriptive narrative survey; it does not make original causal claims or run experiments. Statements like 'incorporating agent mechanisms can facilitate the challenges' describe others' reported findings, not the authors' own causal inference.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper makes sweeping claims such as LLM-based agents 'exhibit robust generalization capabilities across various applications' and offers prospects across biology, climate, military, and economics without bounding these to specific conditions, paper counts, or evidence strength.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The survey presents each technique and application area in a uniformly positive light; competing hypotheses (e.g., whether LLM agents actually generalize better than RL agents) and alternative interpretations of results are not discussed.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "The paper does not conduct original empirical analysis, so proxy-vs-real-outcome conflation is not applicable; it describes what external studies measured without making its own measurement claims.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Section 6.2 is titled 'Challenges' and covers intrinsic LLM constraints, dynamic scaling, and security, but this addresses challenges for the research field, not limitations or threats-to-validity of the survey itself as a review methodology.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats to the survey's validity are discussed — no mention of selection bias, publication bias, scope limitations of the literature covered, or the non-systematic nature of the paper selection process.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper never explicitly states what it excludes or where its coverage ends; there is no statement of what time period, venues, or paper types were included or omitted.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment appears anywhere in the provided paper text; funding sources are entirely absent.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are listed on the title page: CUHK Shenzhen, DeepWisdom, Peking University, Yantu.ai, and Tencent FiT.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funder is disclosed, making this criterion not applicable.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement, no declaration of patents, equity, or consulting relationships appears in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 1.1 defines 'agent' and its five characteristics; Section 2.1 formally defines a single LLM-based agent as a quintuple (L, O, M, A, R); multi-agent systems are defined with reference to standard taxonomies in Section 2.2.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The abstract and Section 1.3 explicitly state the paper surveys current research to provide an in-depth overview, covering definitions, frameworks, components, multi-agent mechanisms, datasets, applications, and prospects.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper cites over 330 references and organizes them into structured typologies (Figures 4–11), contextualizing each approach relative to alternatives and situating the LLM-agent paradigm against RL-based predecessors.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "survey": {
    120       "search_and_selection": {
    121         "search_strategy_reproducible": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No search strategy is described anywhere in the paper; there is no mention of how papers were identified, which databases were queried, or what queries were used.",
    125           "source": "haiku"
    126         },
    127         "inclusion_exclusion_explicit": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "No inclusion or exclusion criteria are stated; papers appear to have been selected by the authors' discretion without documented criteria.",
    131           "source": "haiku"
    132         },
    133         "prisma_or_structured_protocol": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "PRISMA or any other structured review protocol is not mentioned; the paper is an informal narrative survey.",
    137           "source": "haiku"
    138         },
    139         "search_terms_provided": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No search terms or queries are provided anywhere in the paper.",
    143           "source": "haiku"
    144         },
    145         "databases_listed": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "No databases or sources (arXiv, ACL Anthology, Google Scholar, etc.) are listed as having been searched.",
    149           "source": "haiku"
    150         },
    151         "screening_process_documented": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No PRISMA-style flow diagram or staged counts of papers identified, screened, and included are provided.",
    155           "source": "haiku"
    156         },
    157         "review_scope_justified": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "The paper does not justify why it covers the time period it does, which venues are included, or why particular application domains were selected over others.",
    161           "source": "haiku"
    162         }
    163       },
    164       "synthesis_quality": {
    165         "conflicting_findings_acknowledged": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The paper presents each system and domain area descriptively and positively; conflicting empirical findings across reviewed papers (e.g., cases where LLM agents underperform simpler baselines) are not acknowledged.",
    169           "source": "haiku"
    170         },
    171         "quality_assessment_of_sources": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "All cited works are treated uniformly regardless of venue, methodology, or rigor; GitHub repositories are cited alongside peer-reviewed papers without quality differentiation.",
    175           "source": "haiku"
    176         },
    177         "publication_bias_discussed": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "Publication bias is never mentioned; the positive framing throughout the paper does not acknowledge that the literature may systematically over-report successful agent applications.",
    181           "source": "haiku"
    182         },
    183         "quantitative_synthesis_present": {
    184           "applies": true,
    185           "answer": false,
    186           "justification": "The survey is entirely narrative; no meta-analysis, vote counting, effect size aggregation, or any form of quantitative synthesis is performed.",
    187           "source": "haiku"
    188         },
    189         "recommendations_supported_by_evidence": {
    190           "applies": true,
    191           "answer": false,
    192           "justification": "Sections 5 and 6 offer numerous future research recommendations (e.g., 'LLM-based agents demonstrate substantial promise in upcoming mathematical research') that are speculative and not derived from evidence synthesis across reviewed papers.",
    193           "source": "haiku"
    194         }
    195       }
    196     }
    197   },
    198   "claims": [
    199     {
    200       "claim": "LLM-based agents exhibit robust generalization capabilities across various applications compared to RL-based agents.",
    201       "evidence": "The paper contrasts RL-based agents' specialization limitations (Section 1.2) with LLMs' zero-shot/few-shot generalization (Section 1.3), citing GPT-4 and general-purpose systems like HuggingGPT and AutoGPT.",
    202       "supported": "weak"
    203     },
    204     {
    205       "claim": "Multi-agent systems are particularly advantageous for tasks spanning multiple domains due to specialized per-agent expertise.",
    206       "evidence": "Section 2.2 argues each agent in MAS typically has domain expertise; examples include ChatDev and MetaGPT for software roles and Boiko et al. for chemistry.",
    207       "supported": "moderate"
    208     },
    209     {
    210       "claim": "Context length constraint and hallucination are the primary intrinsic limitations of LLMs that agent mechanisms can partially address.",
    211       "evidence": "Sections 1.3 and 6.2.1 list context length, knowledge update lag, and the lack of direct tool use as limitations; agent mechanisms like memory and tool use are offered as mitigations.",
    212       "supported": "moderate"
    213     },
    214     {
    215       "claim": "Centralized Planning Decentralized Execution (CPDE) offers global optimization but risks single-point failure and poor real-time adaptability.",
    216       "evidence": "Section 3.2.2 explicitly discusses CPDE merits and limitations including computational complexity and vulnerability to single-point failures.",
    217       "supported": "moderate"
    218     },
    219     {
    220       "claim": "There is currently no widely used benchmark for LLM-based agents.",
    221       "evidence": "Section 4.2 states: 'Currently, there is no widely used benchmark for LLM-based agents, although some studies engage in comparative analysis.'",
    222       "supported": "strong"
    223     },
    224     {
    225       "claim": "LLM-based agents can simulate credible human behavior in social, economic, and psychological contexts.",
    226       "evidence": "Section 5.3 cites Generative Agents, Horton's economic simulations, and Aher et al.'s psychological experiment replication as evidence.",
    227       "supported": "weak"
    228     }
    229   ],
    230   "methodology_tags": [
    231     "theoretical",
    232     "qualitative"
    233   ],
    234   "key_findings": "This is a broad narrative survey organizing the 2023–2024 LLM-based agent literature into a framework of single-agent components (planning, memory, rethinking, environments, action) and multi-agent system patterns (cooperative/competitive/hierarchical relationships, CPDE vs DPDE planning paradigms, communication efficiency strategies). The paper identifies no widely accepted benchmark for LLM agents and highlights three persistent open challenges: intrinsic LLM limitations (context length, hallucination), dynamic scaling in multi-agent systems, and security/trust. The survey covers application prospects across natural sciences, social sciences, and engineering systems but offers no systematic evidence synthesis — it is essentially a well-organized literature catalog with speculative future-directions sections.",
    235   "red_flags": [
    236     {
    237       "flag": "No systematic search methodology",
    238       "detail": "The paper provides no search strategy, databases searched, search terms, inclusion/exclusion criteria, or PRISMA flow diagram. It is an informal narrative review rather than a systematic survey."
    239     },
    240     {
    241       "flag": "GitHub repos cited as primary sources",
    242       "detail": "Tables 1 and 2 include GitHub repositories (AutoGPT, BabyAGI, AGiXT, LoopGPT, SmolModels, DemoGPT, WorkGPT) alongside peer-reviewed papers with no quality differentiation."
    243     },
    244     {
    245       "flag": "No quality assessment of sources",
    246       "detail": "All ~330 cited works are treated as equally credible; workshop papers, preprints, and GitHub repos receive the same weight as ICLR/NeurIPS publications."
    247     },
    248     {
    249       "flag": "Speculative prospects presented as imminent",
    250       "detail": "Section 5 presents extensive future applications (military AI, climate modeling, drug discovery) with optimistic framing unsupported by systematic evidence from the reviewed papers."
    251     },
    252     {
    253       "flag": "No funding disclosure",
    254       "detail": "No acknowledgment section or funding statement appears in the paper, even though some authors are affiliated with a major tech company (Tencent FiT)."
    255     },
    256     {
    257       "flag": "No limitations section for survey methodology",
    258       "detail": "Section 6.2 addresses technical challenges in the field, not the survey's own methodological limitations, scope boundaries, or potential blind spots in coverage."
    259     }
    260   ],
    261   "cited_papers": [
    262     {
    263       "title": "Generative Agents: Interactive Simulacra of Human Behavior",
    264       "relevance": "Core reference for multi-agent sociological simulation; central example in Sections 3.1.2, 3.2.1, and 5.3.3."
    265     },
    266     {
    267       "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework",
    268       "relevance": "Primary example of cooperative multi-agent software development used throughout Sections 3.2.1, 3.2.2, and 5.3.7."
    269     },
    270     {
    271       "title": "ChatDev: Communicative Agents for Software Development",
    272       "relevance": "Key cooperative MAS example used in planning, memory, and multi-agent relationship sections."
    273     },
    274     {
    275       "title": "Voyager: An Open-Ended Embodied Agent with Large Language Models",
    276       "relevance": "Primary example of lifelong learning in gaming environments; cited in planning, memory, and environments sections."
    277     },
    278     {
    279       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    280       "relevance": "Foundational rethinking/in-context learning method referenced in Section 3.1.3."
    281     },
    282     {
    283       "title": "ToolLLM: Facilitating Large Language Models to Master 16000+ Real-World APIs",
    284       "relevance": "Tool planning framework and benchmark cited in Sections 2.3, 3.1.5, and 4.2."
    285     },
    286     {
    287       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework",
    288       "relevance": "Leading hierarchical MAS framework cited in Sections 2.3 and 3.2.1."
    289     },
    290     {
    291       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    292       "relevance": "Key rethinking method combining ICL and self-reflection cited in Sections 3.1.2 and 3.1.3."
    293     },
    294     {
    295       "title": "AgentBench: Evaluating LLMs as Agents",
    296       "relevance": "Cited in Section 6.1 as the most comprehensive evaluation platform for agent foundational capabilities."
    297     },
    298     {
    299       "title": "HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace",
    300       "relevance": "Primary example of tool-use and multi-model orchestration in single-agent action sections."
    301     }
    302   ],
    303   "engagement_factors": {
    304     "practical_relevance": {
    305       "score": 2,
    306       "justification": "The taxonomy of agent components and multi-agent patterns provides practitioners with a useful organizational map of the 2023 LLM-agent landscape."
    307     },
    308     "surprise_contrarian": {
    309       "score": 0,
    310       "justification": "The paper is entirely descriptive and confirmatory; it does not challenge any conventional wisdom or report unexpected findings."
    311     },
    312     "fear_safety": {
    313       "score": 1,
    314       "justification": "Section 6.2.3 briefly raises security and trust concerns for LLM agents, and Section 5.4.7 touches on military AI ethics, but neither is developed in depth."
    315     },
    316     "drama_conflict": {
    317       "score": 0,
    318       "justification": "No controversy, competitive framing, or adversarial positioning; the paper reads as a neutral catalog."
    319     },
    320     "demo_ability": {
    321       "score": 2,
    322       "justification": "Tables 1 and 2 include numerous open-source projects (AutoGPT, LangChain, AgentVerse, AutoGen) that readers can immediately try."
    323     },
    324     "brand_recognition": {
    325       "score": 1,
    326       "justification": "Authors are from CUHK Shenzhen and Tencent FiT, known institutions but not the most prominent AI labs; co-author Sirui Hong is associated with MetaGPT, which has some recognition."
    327     }
    328   },
    329   "hn_data": {
    330     "threads": [
    331       {
    332         "hn_id": "39294383",
    333         "title": "Escalation Risks from Language Models in Military and Diplomatic Decision-Making",
    334         "points": 52,
    335         "comments": 12,
    336         "url": "https://news.ycombinator.com/item?id=39294383",
    337         "created_at": "2024-02-07T21:12:01Z"
    338       },
    339       {
    340         "hn_id": "39279295",
    341         "title": "Escalation Risks from Language Models in Military and Diplomatic Decision-Making",
    342         "points": 2,
    343         "comments": 0,
    344         "url": "https://news.ycombinator.com/item?id=39279295",
    345         "created_at": "2024-02-06T19:30:13Z"
    346       }
    347     ],
    348     "top_points": 52,
    349     "total_points": 54,
    350     "total_comments": 12
    351   }
    352 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs