ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (18250B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Large Language Model based Multi-Agents: A Survey of Progress and Challenges",
      6     "authors": [
      7       "Taicheng Guo",
      8       "Xiuying Chen",
      9       "Yaqi Wang",
     10       "Ruidi Chang",
     11       "Shichao Pei"
     12     ],
     13     "year": 2024,
     14     "venue": "International Joint Conference on Artificial Intelligence",
     15     "arxiv_id": "2402.01680",
     16     "doi": "10.48550/arXiv.2402.01680"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims LLM-MA systems have achieved progress in problem-solving and world simulation, and the body reviews dozens of papers across software development, societal simulation, gaming, and other domains that substantiate these claims.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "This survey does not conduct original empirical work or make novel causal claims; all causal language (e.g., 'debating can improve factuality') is attributed to cited primary papers, not the survey's own study design.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The survey makes broad statements such as 'multi-agent systems offer advanced capabilities' and 'LLM-MA has achieved considerable progress' without bounding these to specific settings, model families, task types, or evidence quality levels.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss whether apparent gains from multi-agent architectures could be explained by greater total compute, prompt engineering differences, or cherry-picked benchmarks; it presents improvements as straightforwardly attributable to multi-agent design.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper treats MMLU, HumanEval, and game-winning rates as direct evidence of reasoning and collaboration capability without acknowledging these are proxies whose relationship to real-world multi-agent utility is contested.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Section 6 ('Challenges and Opportunities') discusses open problems for the field, not limitations of this survey's own methodology, coverage, or potential biases in paper selection.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats to the validity of this survey are identified—no discussion of selection bias, English-language bias, recency bias toward arXiv preprints, or the survey's own coverage gaps.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "While the survey focuses on LLM-based multi-agent systems, it does not state explicit temporal bounds, venue restrictions, or principled exclusion criteria that define what is outside its scope.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No acknowledgment section or funding disclosure appears in the provided text; funding sources are entirely absent.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All five authors list institutional affiliations (Notre Dame, KAUST, SUSTech, UMass Boston) on the title page.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Funding is not disclosed, so independence of the funder cannot be assessed.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) appears anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper defines 'multi-agent system' in contrast to single-agent, and Section 3 operationalizes key sub-concepts: agents-environment interface types (Sandbox, Physical, None), profiling methods (Pre-defined, Model-Generated, Data-Derived), and communication paradigms (Cooperative, Debate, Competitive).",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The introduction explicitly states four organizing questions the survey addresses (domains/environments, profiling, communication, capability acquisition) and describes the intended output as a comprehensive overview with taxonomy, resources, and challenge identification.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper explicitly references two prior surveys ([Xi et al., 2023; Wang et al., 2023b]) and positions itself as complementing them 'by tackling unresolved questions,' specifically the multi-agent (vs. single-agent) angle.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "survey": {
    120       "search_and_selection": {
    121         "search_strategy_reproducible": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No search strategy is described anywhere; there is no mention of how papers were identified, what queries were run, or what sources were consulted.",
    125           "source": "haiku"
    126         },
    127         "inclusion_exclusion_explicit": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "No inclusion or exclusion criteria are stated; the paper simply presents a curated set of works with no explanation of what was considered or rejected.",
    131           "source": "haiku"
    132         },
    133         "prisma_or_structured_protocol": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No structured review protocol (PRISMA, PRISMA-ScR, or otherwise) is mentioned or followed.",
    137           "source": "haiku"
    138         },
    139         "search_terms_provided": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No search terms or queries of any kind are provided.",
    143           "source": "haiku"
    144         },
    145         "databases_listed": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "No databases or sources (arXiv, Semantic Scholar, ACL Anthology, etc.) are listed as having been searched.",
    149           "source": "haiku"
    150         },
    151         "screening_process_documented": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No screening process with counts at any stage is documented; the paper goes directly from motivation to taxonomy without describing how the reviewed papers were assembled.",
    155           "source": "haiku"
    156         },
    157         "review_scope_justified": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "The temporal and venue scope of the review is never justified; Figure 1 shows paper counts at 3-month intervals but does not explain what corpus was used to generate those counts.",
    161           "source": "haiku"
    162         }
    163       },
    164       "synthesis_quality": {
    165         "conflicting_findings_acknowledged": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The paper does not acknowledge conflicting results across reviewed studies (e.g., whether multi-agent debate consistently helps or hurts, or when single-agent outperforms multi-agent); it presents each application domain's findings as uniformly positive.",
    169           "source": "haiku"
    170         },
    171         "quality_assessment_of_sources": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No quality rubric, risk-of-bias assessment, or structured evaluation of source papers is applied; papers from peer-reviewed venues and anonymous workshop submissions are treated identically (Table 1 lists '[Anonymous, 2023]' without comment).",
    175           "source": "haiku"
    176         },
    177         "publication_bias_discussed": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "Publication bias is never mentioned; the survey does not acknowledge that reviewed papers skew toward positive results or that failed multi-agent experiments are unlikely to be published.",
    181           "source": "haiku"
    182         },
    183         "quantitative_synthesis_present": {
    184           "applies": true,
    185           "answer": false,
    186           "justification": "The survey provides paper counts in Figure 1 and a taxonomy table (Table 1) but no quantitative synthesis of results, effect sizes, or vote-counting across comparable evaluations.",
    187           "source": "haiku"
    188         },
    189         "recommendations_supported_by_evidence": {
    190           "applies": true,
    191           "answer": false,
    192           "justification": "Section 6 recommendations (multi-modal integration, hallucination mitigation, scaling) are stated as future directions based on narrative reading rather than evidence-grounded synthesis; no reviewed evidence is marshalled to prioritize one direction over another.",
    193           "source": "haiku"
    194         }
    195       }
    196     }
    197   },
    198   "claims": [
    199     {
    200       "claim": "LLM-based multi-agent systems have achieved considerable progress in complex problem-solving and world simulation.",
    201       "evidence": "The paper reviews dozens of papers across software development, embodied AI, science experiments, societal simulation, and gaming domains.",
    202       "supported": "weak"
    203     },
    204     {
    205       "claim": "Multi-agent systems offer advanced capabilities over single-agent systems by specializing LLMs into distinct roles and enabling inter-agent interactions.",
    206       "evidence": "Described conceptually in Section 2.2 and illustrated via Table 1; no controlled comparison between single- and multi-agent performance is synthesized.",
    207       "supported": "weak"
    208     },
    209     {
    210       "claim": "Research volume in LLM-based multi-agents is rapidly increasing.",
    211       "evidence": "Figure 1 shows paper counts at 3-month intervals across categories, indicating growth trends.",
    212       "supported": "moderate"
    213     },
    214     {
    215       "claim": "Multi-agent debate can improve factuality and reasoning in LLMs.",
    216       "evidence": "Attributed to Du et al. (2023) and Xiong et al. (2023); the survey does not independently verify or quantify this claim.",
    217       "supported": "weak"
    218     },
    219     {
    220       "claim": "There is a notable shortfall in comprehensive benchmarks for science team operations, economic analysis, and disease propagation simulation.",
    221       "evidence": "Table 2 shows no data links or benchmarks available for several world simulation domains.",
    222       "supported": "moderate"
    223     }
    224   ],
    225   "methodology_tags": [
    226     "qualitative"
    227   ],
    228   "key_findings": "This survey provides a four-dimension taxonomy of LLM-based multi-agent systems covering agents-environment interface (Sandbox, Physical, None), agent profiling methods (Pre-defined, Model-Generated, Data-Derived), communication structures (layered, decentralized, centralized, shared pool), and capability acquisition mechanisms (memory, self-evolution, dynamic generation). Applications are organized into problem-solving (software development, embodied agents, science experiments, science debate) and world simulation (societal, gaming, psychology, economics, policy, disease propagation). Key challenges identified include hallucination cascades in multi-agent settings, difficulty acquiring collective intelligence, scaling computational costs, and the absence of comprehensive benchmarks across many application domains. The survey is purely narrative with no systematic search methodology.",
    229   "red_flags": [
    230     {
    231       "flag": "No systematic search methodology",
    232       "detail": "The paper is presented as a 'survey' but describes no search strategy, databases searched, search terms, or inclusion/exclusion criteria—it is effectively a curated reading list."
    233     },
    234     {
    235       "flag": "No survey-level limitations section",
    236       "detail": "Section 6 discusses field-level challenges but never addresses the survey's own limitations such as coverage gaps, selection bias toward successful systems, or temporal scope."
    237     },
    238     {
    239       "flag": "No funding disclosure",
    240       "detail": "No acknowledgment or funding section appears in the paper, leaving potential conflicts of interest undisclosed."
    241     },
    242     {
    243       "flag": "No quality assessment of sources",
    244       "detail": "Peer-reviewed papers, arXiv preprints, and anonymous under-review submissions are cited interchangeably with no quality differentiation."
    245     },
    246     {
    247       "flag": "Uncritical synthesis of positive results",
    248       "detail": "The paper presents each domain's findings as uniformly progressive without acknowledging null results, failures, or conflicting evidence about when multi-agent systems do not outperform single-agent baselines."
    249     },
    250     {
    251       "flag": "Publication bias ignored",
    252       "detail": "No discussion of the likelihood that negative multi-agent results go unpublished, which would inflate the apparent success rate of the reviewed systems."
    253     }
    254   ],
    255   "cited_papers": [
    256     {
    257       "title": "Generative Agents: Interactive Simulacra of Human Behavior",
    258       "relevance": "Foundational societal simulation work with 25-agent sandbox; directly reviewed as a core LLM-MA application."
    259     },
    260     {
    261       "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework",
    262       "relevance": "Key software development multi-agent framework reviewed; introduces shared message pool communication structure."
    263     },
    264     {
    265       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework",
    266       "relevance": "One of three open-source frameworks described in Section 5.1 as implementation tools for LLM-MA."
    267     },
    268     {
    269       "title": "Improving Factuality and Reasoning in Language Models through Multiagent Debate",
    270       "relevance": "Core empirical basis for the science debate application category; claimed to show debate improves factuality."
    271     },
    272     {
    273       "title": "A Survey on Large Language Model Based Autonomous Agents",
    274       "relevance": "Prior survey explicitly positioned against; this paper claims to complement it by focusing on multi-agent rather than single-agent systems."
    275     },
    276     {
    277       "title": "RoCo: Dialectic Multi-Robot Collaboration with Large Language Models",
    278       "relevance": "Representative embodied agents paper reviewed under multi-robot collaboration category."
    279     },
    280     {
    281       "title": "The Rise and Potential of Large Language Model Based Agents: A Survey",
    282       "relevance": "Second prior survey used to position this work; both are cited as existing reviews this paper complements."
    283     },
    284     {
    285       "title": "CAMEL: Communicative Agents for Mind Exploration of Large Scale Language Model Society",
    286       "relevance": "One of three frameworks reviewed in Section 5.1; introduces inception prompting for autonomous agent cooperation."
    287     },
    288     {
    289       "title": "Communicative Agents for Software Development (ChatDev)",
    290       "relevance": "Core software development multi-agent paper reviewed; models SOP-based agent collaboration for code generation."
    291     },
    292     {
    293       "title": "Welfare Diplomacy: Benchmarking Language Model Cooperation",
    294       "relevance": "Reviewed under gaming simulation; introduces open-source benchmark for multi-agent cooperation in game-theoretic settings."
    295     }
    296   ],
    297   "engagement_factors": {
    298     "practical_relevance": {
    299       "score": 2,
    300       "justification": "The taxonomy, framework overview (MetaGPT, CAMEL, AutoGen), and dataset table (Table 2) give practitioners useful orientation for building or evaluating LLM-MA systems."
    301     },
    302     "surprise_contrarian": {
    303       "score": 0,
    304       "justification": "The paper affirms the mainstream view that multi-agent systems are beneficial and rapidly progressing; no findings challenge conventional wisdom."
    305     },
    306     "fear_safety": {
    307       "score": 1,
    308       "justification": "Hallucination cascades in multi-agent settings (Section 6.2) are flagged as a safety-relevant concern, but the treatment is brief and non-alarming."
    309     },
    310     "drama_conflict": {
    311       "score": 0,
    312       "justification": "Neutral taxonomic survey with no controversy, competing claims, or adversarial framing."
    313     },
    314     "demo_ability": {
    315       "score": 2,
    316       "justification": "Points to three major open-source frameworks (MetaGPT, CAMEL, AutoGen) and an open GitHub repository, enabling immediate exploration."
    317     },
    318     "brand_recognition": {
    319       "score": 1,
    320       "justification": "University of Notre Dame and KAUST are respected institutions but not famous AI labs; no marquee industry affiliation."
    321     }
    322   },
    323   "hn_data": {
    324     "threads": [],
    325     "top_points": 0,
    326     "total_points": 0,
    327     "total_comments": 0
    328   }
    329 }

Impressum · Datenschutz