scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (18217B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Exploring Large Language Model based Intelligent Agents: Definitions, Methods, and Prospects",
      6     "authors": [
      7       "Yuheng Cheng",
      8       "Ceyao Zhang",
      9       "Zhengwen Zhang",
     10       "Xiangrui Meng",
     11       "Sirui Hong"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2401.03428",
     16     "doi": "10.48550/arXiv.2401.03428"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims to provide an 'in-depth overview of LLM-based intelligent agents within single-agent and multi-agent systems' covering definitions, frameworks, and components. The paper does deliver this through Sections 2-3 with detailed taxonomies and tables.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "The paper is a survey/taxonomy and does not make causal claims about system performance.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper makes broad claims about LLM-based agents' capabilities and prospects across many domains (natural sciences, social sciences, engineering, military) without bounding the scope of its survey methodology or acknowledging which areas it may have incomplete coverage of.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "This is a taxonomy/survey paper presenting no empirical results that require alternative explanations.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "Theoretical/survey paper with no measurements.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 6.2 'Challenges' discusses intrinsic constraints of LLMs (6.2.1), dynamic scaling (6.2.2), and security and trust (6.2.3). While titled 'Challenges' rather than 'Limitations,' it serves a similar function with substantive discussion.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Section 6.2 discusses challenges of LLM-based agents generally but does not discuss specific threats to the validity of this survey itself — no mention of selection bias, coverage gaps, or limitations of the review methodology.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what it excludes from scope. It covers an extremely broad range of domains without stating boundaries on what was NOT covered or what claims the survey is NOT making.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding sources or acknowledgments section is present in the paper.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are listed: CUHK Shenzhen, DeepWisdom, Peking University, Yantu.ai, FiT Tencent. Notably, co-author Sirui Hong is from DeepWisdom (creators of MetaGPT), and MetaGPT is featured prominently in the survey.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No funding is disclosed, so independence cannot be assessed. Given affiliations with DeepWisdom (MetaGPT) and Tencent, potential conflicts exist but are not addressed.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement is present. Authors affiliated with DeepWisdom (MetaGPT) and Tencent may have financial interests in systems discussed.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper explicitly defines 'intelligent agent,' characterizes LLM-based agents as a formal quintuple V=(L,O,M,A,R), and enumerates the properties of single-agent and multi-agent systems in Section 2.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper clearly states in the abstract and introduction that its contribution is a survey providing 'an in-depth overview of LLM-based intelligent agents within single-agent and multi-agent systems' covering definitions, frameworks, and prospects.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper contextualizes LLM-based agents against classical AI agents and RL-based agents in Section 1, explains why RL-based agents are insufficient, and builds taxonomy structures that organize 300+ referenced works relative to each other.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "survey": {
    120       "search_and_selection": {
    121         "search_strategy_reproducible": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No search strategy is described anywhere; the paper provides no explanation of how papers were identified, which databases were queried, or what queries were used.",
    125           "source": "haiku"
    126         },
    127         "inclusion_exclusion_explicit": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "No inclusion or exclusion criteria are stated; it is entirely unclear why certain papers were included while others were not.",
    131           "source": "haiku"
    132         },
    133         "prisma_or_structured_protocol": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No PRISMA flowchart, systematic review protocol, or other structured methodology is mentioned or referenced.",
    137           "source": "haiku"
    138         },
    139         "search_terms_provided": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No search terms, queries, or keywords used to find papers are provided anywhere in the paper.",
    143           "source": "haiku"
    144         },
    145         "databases_listed": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "No databases or sources searched are listed; the paper does not state where or how papers were found.",
    149           "source": "haiku"
    150         },
    151         "screening_process_documented": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No screening process is documented; there are no counts at any stage of paper selection and no discussion of papers considered but excluded.",
    155           "source": "haiku"
    156         },
    157         "review_scope_justified": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "The time range, venue selection, and topical boundaries of the review are never justified; no rationale is given for why the survey covers these specific papers and not others.",
    161           "source": "haiku"
    162         }
    163       },
    164       "synthesis_quality": {
    165         "conflicting_findings_acknowledged": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The paper presents an overwhelmingly positive view of LLM-based agents throughout; conflicting evidence — such as cases where agents fail, regress, or underperform simpler methods — is absent from the synthesis.",
    169           "source": "haiku"
    170         },
    171         "quality_assessment_of_sources": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "All cited papers are treated as equally credible regardless of their methodological quality; there is no rubric, risk-of-bias tool, or structured evaluation of the rigor of reviewed works.",
    175           "source": "haiku"
    176         },
    177         "publication_bias_discussed": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "Publication bias is never mentioned; the survey does not acknowledge that only successful, publishable systems enter the literature it reviews.",
    181           "source": "haiku"
    182         },
    183         "quantitative_synthesis_present": {
    184           "applies": true,
    185           "answer": false,
    186           "justification": "The paper is entirely narrative with no meta-analysis, vote-counting, effect size aggregation, or quantitative synthesis of performance across reviewed systems.",
    187           "source": "haiku"
    188         },
    189         "recommendations_supported_by_evidence": {
    190           "applies": true,
    191           "answer": false,
    192           "justification": "Section 5's extensive 'Prospect Applications' and per-section future direction bullet points are speculative author opinion; they are not grounded in systematic evidence synthesized from the reviewed literature.",
    193           "source": "haiku"
    194         }
    195       }
    196     }
    197   },
    198   "claims": [
    199     {
    200       "claim": "LLM-based agents represent a potential path toward artificial general intelligence (AGI)",
    201       "evidence": "Stated in abstract and conclusion as the authors' belief; no systematic evidence or benchmarks against AGI criteria are presented.",
    202       "supported": "unsupported"
    203     },
    204     {
    205       "claim": "LLM-based agents exhibit robust generalization capabilities across various applications due to natural language as interface",
    206       "evidence": "Examples of agent systems across domains are cataloged, but no comparative evaluation of generalization across domains is synthesized.",
    207       "supported": "weak"
    208     },
    209     {
    210       "claim": "RL-based agents have fundamental limitations (training time, sample efficiency, stability, generalizability) that LLM-based agents partially address",
    211       "evidence": "Section 1.2 cites RL limitation literature and Section 1.3 lists LLM advantages; the comparative framing is plausible but not rigorously evaluated.",
    212       "supported": "moderate"
    213     },
    214     {
    215       "claim": "LLM-based agent planning capabilities can be taxonomized into ICL methods, external capability methods, and multi-stage methods",
    216       "evidence": "The taxonomy is developed by systematic categorization of referenced works and illustrated with specific examples for each category; this organizational claim is well-supported.",
    217       "supported": "strong"
    218     },
    219     {
    220       "claim": "Multi-agent systems are particularly advantageous for tasks spanning multiple domains",
    221       "evidence": "Examples from ChatDev, MetaGPT, and CGMI are given but no systematic comparison to single-agent baselines is synthesized.",
    222       "supported": "weak"
    223     },
    224     {
    225       "claim": "Communication efficiency in MAS can be improved via structured protocols, mediator models, and hallucination mitigation",
    226       "evidence": "Referenced works on ACL protocols, FIPA standards, and hallucination reduction are cited, but no quantitative evidence is synthesized to support the magnitude of improvement.",
    227       "supported": "weak"
    228     }
    229   ],
    230   "methodology_tags": [
    231     "qualitative",
    232     "theoretical"
    233   ],
    234   "key_findings": "This paper provides a broad narrative taxonomy of LLM-based intelligent agent research, organizing single-agent systems by planning, memory, rethinking, and action components, and multi-agent systems by relationship type (cooperative, competitive, hierarchical) and planning paradigm (CPDE vs DPDE). It catalogs over 300 papers across application domains including natural sciences, social sciences, and engineering. However, the paper lacks any systematic search methodology, quality assessment of sources, or quantitative synthesis, making it an informal literature review rather than a rigorous systematic survey; its AGI framing and uniformly positive stance are unsupported by the evidence it presents.",
    235   "red_flags": [
    236     {
    237       "flag": "No search methodology",
    238       "detail": "The paper provides no description of how papers were identified, searched, or selected; it reads as an informal curated overview rather than a systematic survey, making replication impossible."
    239     },
    240     {
    241       "flag": "No source quality assessment",
    242       "detail": "All reviewed papers are treated as equally credible; a GitHub README (AutoGPT) and a Nature paper (Coscientist) receive identical treatment without any evaluation of methodological rigor."
    243     },
    244     {
    245       "flag": "AGI overclaiming",
    246       "detail": "The abstract and conclusion frame LLM-based agents as 'a potential path toward artificial general intelligence' without any benchmarks or evidence supporting this claim."
    247     },
    248     {
    249       "flag": "Speculative prospects section",
    250       "detail": "Section 5 spans ~15 pages of future applications across every conceivable domain, presenting speculative possibilities alongside actual demonstrated capabilities without distinguishing them."
    251     },
    252     {
    253       "flag": "No funding disclosure",
    254       "detail": "No acknowledgment of funding sources appears; co-author Sirui Hong is affiliated with DeepWisdom (a commercial AI company), whose product MetaGPT, itself co-authored by authors of this paper, is prominently featured."
    255     },
    256     {
    257       "flag": "Self-citation promotion",
    258       "detail": "MetaGPT (co-authored by paper author Sirui Hong from DeepWisdom) is featured throughout the paper in planning, memory, MAS relationship, and application sections without disclosure of this conflict."
    259     }
    260   ],
    261   "cited_papers": [
    262     {
    263       "title": "Generative Agents: Interactive Simulacra of Human Behavior",
    264       "relevance": "Central example for multi-agent cooperative systems and memory mechanisms; heavily cited across planning, memory, and MAS sections."
    265     },
    266     {
    267       "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework",
    268       "relevance": "Key example of multi-agent software development with shared memory; co-authored by a paper author, raising undisclosed conflict of interest."
    269     },
    270     {
    271       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    272       "relevance": "Foundational ICL rethinking method; central to the planning and action taxonomy."
    273     },
    274     {
    275       "title": "Voyager: An Open-Ended Embodied Agent with Large Language Models",
    276       "relevance": "Primary example for lifelong learning agents and skill-based long-term memory in game environments."
    277     },
    278     {
    279       "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models",
    280       "relevance": "Core ICL planning methodology; anchor for the entire planning capability taxonomy."
    281     },
    282     {
    283       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    284       "relevance": "Key rethinking mechanism discussed in both ICL and RL-based rethinking sections."
    285     },
    286     {
    287       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework",
    288       "relevance": "Primary example of hierarchical MAS; discussed in relationship types and agent system templates."
    289     },
    290     {
    291       "title": "AgentBench: Evaluating LLMs as Agents",
    292       "relevance": "Benchmark for holistic agent capability assessment; discussed in evaluation trends section."
    293     },
    294     {
    295       "title": "ToolLLM: Facilitating Large Language Models to Master 16000+ Real-World APIs",
    296       "relevance": "Used as both a tool-use methodology example and an agent system template reference."
    297     },
    298     {
    299       "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models",
    300       "relevance": "Key ICL planning method in the planning capability taxonomy."
    301     }
    302   ],
    303   "engagement_factors": {
    304     "practical_relevance": {
    305       "score": 2,
    306       "justification": "Provides a useful taxonomy of agent components and references to available open-source frameworks (LangChain, AutoGPT, MetaGPT, BabyAGI) that practitioners can directly use."
    307     },
    308     "surprise_contrarian": {
    309       "score": 0,
    310       "justification": "The paper presents a conventional, positive view of LLM agents aligned with consensus; no surprising findings or contrarian claims are made."
    311     },
    312     "fear_safety": {
    313       "score": 1,
    314       "justification": "Section 6.2.3 briefly raises security and trust issues including excessive permissions and hallucinations, but the treatment is superficial."
    315     },
    316     "drama_conflict": {
    317       "score": 0,
    318       "justification": "No controversy, no competing claims, no critical stance toward any work; the paper is uniformly positive about the field."
    319     },
    320     "demo_ability": {
    321       "score": 2,
    322       "justification": "Many referenced systems (AutoGPT, LangChain, MetaGPT, BabyAGI, AgentGPT) are open-source and publicly accessible for immediate demonstration."
    323     },
    324     "brand_recognition": {
    325       "score": 2,
    326       "justification": "References GPT-4 extensively, features MetaGPT and ChatDev from recognized groups, and covers well-known frameworks; CUHK Shenzhen is a recognized institution."
    327     }
    328   },
    329   "hn_data": {
    330     "threads": [
    331       {
    332         "hn_id": "39294383",
    333         "title": "Escalation Risks from Language Models in Military and Diplomatic Decision-Making",
    334         "points": 52,
    335         "comments": 12,
    336         "url": "https://news.ycombinator.com/item?id=39294383",
    337         "created_at": "2024-02-07T21:12:01Z"
    338       },
    339       {
    340         "hn_id": "39279295",
    341         "title": "Escalation Risks from Language Models in Military and Diplomatic Decision-Making",
    342         "points": 2,
    343         "comments": 0,
    344         "url": "https://news.ycombinator.com/item?id=39279295",
    345         "created_at": "2024-02-06T19:30:13Z"
    346       }
    347     ],
    348     "top_points": 52,
    349     "total_points": 54,
    350     "total_comments": 12
    351   }
    352 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs