scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19820B)
      1 {
      2   "paper": {
      3     "title": "Large Language Model based Multi-Agents: A Survey of Progress and Challenges",
      4     "authors": ["Taicheng Guo", "Xiuying Chen", "Yaqi Wang", "Ruidi Chang", "Shichao Pei", "Nitesh V. Chawla", "Olaf Wiest", "Xiangliang Zhang"],
      5     "year": 2024,
      6     "venue": "International Joint Conference on Artificial Intelligence",
      7     "arxiv_id": "2402.01680",
      8     "doi": "10.48550/arXiv.2402.01680"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["survey_methodology"],
     12   "methodology_tags": ["meta-analysis"],
     13   "key_findings": "This survey categorizes LLM-based multi-agent systems along four axes: agents-environment interface (sandbox, physical, none), agent profiling (pre-defined, model-generated, data-derived), communication paradigms/structures, and capability acquisition mechanisms. Applications are divided into problem-solving (software development, embodied agents, science experiments, science debate) and world simulation (society, gaming, psychology, economy, policy, disease propagation). The survey identifies key challenges including multi-modal environments, hallucination cascading, collective intelligence acquisition, and scaling.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper maintains an open-source GitHub repository for tracking papers: https://github.com/taichengguo/LLM_MultiAgents_Survey_Papers, mentioned in the abstract and Section 1."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The GitHub repository serves as the curated paper list, which is the survey's primary data artifact. Table 1 and Table 2 summarize the surveyed works and datasets."
     25       },
     26       "environment_specified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "This is a survey paper with no computational experiments requiring environment specifications."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No instructions are provided for how to reproduce the paper selection, filtering, or categorization process. The survey does not describe a systematic search protocol."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "Survey paper with no statistical experiments or aggregated quantitative results."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "Survey paper with no statistical comparisons."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "Survey paper with no experiments."
     52       },
     53       "sample_size_justified": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "Survey paper with no experiments."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "Survey paper with no experiments."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper does not compare against prior surveys on LLM agents (e.g., Xi et al. 2023, Wang et al. 2023b are cited as related but no structured comparison of coverage or methodology is performed)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No experimental baselines in a survey paper."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No system with components to ablate."
     79       },
     80       "multiple_metrics": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No experimental evaluation."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No system outputs to evaluate."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No experimental evaluation."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 1 provides a detailed per-paper breakdown across multiple dimensions (interface, profiling, communication, capabilities). Figure 1 shows per-category paper counts over time."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 6 discusses challenges including hallucination cascading (6.2), limitations of collective intelligence (6.3), scaling difficulties (6.4), and evaluation gaps (6.5)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 6 identifies areas where LLM-MA systems fall short: lack of multi-modal capability (6.1), hallucination propagation (6.2), inability to achieve true collective intelligence (6.3), and missing benchmarks (6.5)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims to provide an in-depth discussion of essential aspects of LLM-MA systems. The paper delivers this through Sections 3-5, covering interface, profiling, communication, capabilities, applications, and resources."
    116       },
    117       "causal_claims_justified": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "The paper is a survey/taxonomy and does not make causal claims about system performance."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper makes broad claims like 'LLM-based Multi-Agents have shown inspiring collective intelligence' (Section 7) without bounding this to specific domains or settings. The subtitle 'A Survey of Progress and Challenges' is appropriately scoped, but the body text often generalizes beyond what the reviewed papers demonstrate."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": false,
    129         "answer": false,
    130         "justification": "Pure survey/taxonomy paper presenting no empirical results of its own."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": false,
    134         "answer": false,
    135         "justification": "Theoretical/survey paper with no measurements of its own."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "Survey paper that does not use models."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "Survey paper that does not use prompting."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "Survey paper with no experiments."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding used in the survey itself."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper does not describe how papers were searched for, selected, or filtered. There is no description of search queries, databases used, inclusion/exclusion criteria, or filtering pipeline. Papers appear to have been collected ad hoc."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations section. Section 6 discusses challenges of LLM-MA systems themselves, not limitations of the survey's own methodology."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to the validity of the survey itself are discussed. The paper does not acknowledge potential biases in paper selection, coverage gaps, or methodological limitations of the review process."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 1 clearly scopes the survey to LLM-based multi-agent systems, distinguishing from single-agent systems (Section 2.2). The paper explicitly states it focuses on multi-agent systems and organizes coverage around specific domains."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The GitHub repository contains the full list of surveyed papers, allowing verification of the survey's coverage."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The paper does not describe how the surveyed papers were collected. No search strategy, databases, queries, or time period for the literature search are provided."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data source is published literature, but this is covered by data_collection_described."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No description of how papers were identified, screened, or selected for inclusion. The pipeline from search to final categorization is undocumented."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding or acknowledgments section is present in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: University of Notre Dame, KAUST, Southern University of Science and Technology, University of Massachusetts Boston."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding information disclosed, so independence cannot be assessed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Survey paper that does not evaluate a pre-trained model on any benchmark."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Survey paper that does not evaluate a pre-trained model on any benchmark."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Survey paper that does not evaluate a pre-trained model on any benchmark."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "Survey paper with no computational method of its own."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "Survey paper with no computational experiments."
    290       }
    291     },
    292     "survey_methodology": {
    293       "prisma_or_structured_protocol": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No structured review protocol is described. No PRISMA diagram, no registered protocol, no reproducible search queries. The paper collection appears ad hoc."
    297       },
    298       "quality_assessment_of_sources": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The survey does not assess the methodological quality of any of the papers it reviews. All papers are treated equally in Table 1 regardless of their rigor, sample sizes, or evaluation quality."
    302       },
    303       "publication_bias_discussed": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No discussion of publication bias. The survey does not consider whether the reviewed papers skew toward positive results or whether negative results in multi-agent systems are underrepresented."
    307       }
    308     }
    309   },
    310   "claims": [
    311     {
    312       "claim": "LLM-based multi-agent systems have achieved considerable progress in complex problem-solving and world simulation.",
    313       "evidence": "Supported by Table 1 listing ~30 papers across software development, embodied agents, science experiments, debate, society simulation, gaming, psychology, economy, policy making, and disease simulation (Sections 4.1-4.2).",
    314       "supported": "moderate"
    315     },
    316     {
    317       "claim": "The volume of research papers in LLM-based multi-agents is rapidly increasing.",
    318       "evidence": "Figure 1 shows paper counts at 3-month intervals, demonstrating growth from early 2023 through late 2023.",
    319       "supported": "strong"
    320     },
    321     {
    322       "claim": "Multi-agent debate can improve factuality and inter-consistency of LLMs.",
    323       "evidence": "Cites Du et al. 2023 showing debate improves factuality on six tasks, and Xiong et al. 2023 claiming debate improves inter-consistency (Section 4.1.4). However, these are cited results, not independently verified.",
    324       "supported": "weak"
    325     },
    326     {
    327       "claim": "LLM-MA systems can effectively mimic real user preferences and behaviors in recommender systems.",
    328       "evidence": "Cites Agent4Rec (Zhang et al. 2023a) using 1000 generative agents initialized with MovieLens-1M data (Section 4.2.5). Single-study evidence.",
    329       "supported": "weak"
    330     }
    331   ],
    332   "red_flags": [
    333     {
    334       "flag": "No systematic review protocol",
    335       "detail": "The survey does not describe any systematic search strategy, inclusion/exclusion criteria, or paper selection methodology. This makes coverage non-reproducible and potentially biased toward the authors' network and awareness."
    336     },
    337     {
    338       "flag": "No quality assessment of reviewed papers",
    339       "detail": "All reviewed papers are presented in Table 1 without any assessment of their methodological quality. A reader cannot distinguish rigorous evaluations from proof-of-concept demos. This risks laundering weak results by presenting them alongside strong ones."
    340     },
    341     {
    342       "flag": "Uncritical presentation of claims",
    343       "detail": "The survey presents claims from reviewed papers (e.g., 'debate improves factuality', 'agents can mimic real user preferences') without critically examining the evidence quality or noting limitations of the original studies."
    344     },
    345     {
    346       "flag": "Self-citation",
    347       "detail": "The survey cites Guo et al. 2023 (first author's own work) as a general reference for LLM capabilities in Section 1, though this appears minor."
    348     }
    349   ],
    350   "cited_papers": [
    351     {
    352       "title": "The Rise and Potential of Large Language Model Based Agents: A Survey",
    353       "authors": ["Zhiheng Xi", "Wenxiang Chen", "Xin Guo"],
    354       "year": 2023,
    355       "arxiv_id": "2309.07864",
    356       "relevance": "Comprehensive survey on single LLM-based agents; directly relevant as the single-agent predecessor to this survey's multi-agent scope."
    357     },
    358     {
    359       "title": "A Survey on Large Language Model based Autonomous Agents",
    360       "authors": ["Lei Wang", "Chen Ma", "Xueyang Feng"],
    361       "year": 2023,
    362       "relevance": "Survey on LLM-based autonomous agents covering memory, planning, and tool use."
    363     },
    364     {
    365       "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework",
    366       "authors": ["Sirui Hong", "Xiawu Zheng", "Jonathan Chen"],
    367       "year": 2023,
    368       "arxiv_id": "2308.00352",
    369       "relevance": "Major multi-agent framework for software development with shared message pool communication."
    370     },
    371     {
    372       "title": "Communicative Agents for Software Development",
    373       "authors": ["Chen Qian", "Xin Cong", "Wei Liu"],
    374       "year": 2023,
    375       "relevance": "End-to-end multi-agent framework (ChatDev) for software development using role-play."
    376     },
    377     {
    378       "title": "CAMEL: Communicative Agents for 'Mind' Exploration of Large Scale Language Model Society",
    379       "authors": ["Guohao Li", "Hasan Abed Al Kader Hammoud"],
    380       "year": 2023,
    381       "arxiv_id": "2303.17760",
    382       "relevance": "Foundational multi-agent framework using inception prompting for autonomous agent cooperation."
    383     },
    384     {
    385       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework",
    386       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    387       "year": 2023,
    388       "arxiv_id": "2308.08155",
    389       "relevance": "Widely-used multi-agent framework enabling customizable agent interactions."
    390     },
    391     {
    392       "title": "Generative Agents: Interactive Simulacra of Human Behavior",
    393       "authors": ["Joon Sung Park", "Joseph C O'Brien"],
    394       "year": 2023,
    395       "arxiv_id": "2304.03442",
    396       "relevance": "Landmark work on generative agents simulating human behavior in sandbox environments."
    397     },
    398     {
    399       "title": "Improving Factuality and Reasoning in Language Models through Multiagent Debate",
    400       "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba"],
    401       "year": 2023,
    402       "relevance": "Key paper showing multi-agent debate can improve LLM factuality and reasoning."
    403     },
    404     {
    405       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    406       "authors": ["Noah Shinn", "Federico Cassano"],
    407       "year": 2023,
    408       "relevance": "Self-reflection mechanism for LLM agents, foundational for agent capability acquisition."
    409     },
    410     {
    411       "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models",
    412       "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"],
    413       "year": 2023,
    414       "relevance": "Key reasoning framework for LLM-based planning and decision-making."
    415     },
    416     {
    417       "title": "AgentVerse: Facilitating Multi-Agent Collaboration and Exploring Emergent Behaviors in Agents",
    418       "authors": ["Weize Chen", "Yusheng Su", "Jingwei Zuo"],
    419       "year": 2023,
    420       "arxiv_id": "2308.10848",
    421       "relevance": "Framework for dynamic multi-agent collaboration with emergent behavior exploration."
    422     },
    423     {
    424       "title": "SOTOPIA: Interactive Evaluation for Social Intelligence in Language Agents",
    425       "authors": ["Xuhui Zhou", "Hao Zhu", "Leena Mathur"],
    426       "year": 2023,
    427       "relevance": "Benchmark for evaluating social intelligence in LLM-based multi-agent interactions."
    428     }
    429   ]
    430 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs