ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (20806B)


      1 {
      2   "paper": {
      3     "title": "A survey on LLM-based multi-agent systems: workflow, infrastructure, and challenges",
      4     "authors": ["Xinyi Li", "Sai Wang", "Siqi Zeng", "Yu Wu", "Yi Yang"],
      5     "year": 2024,
      6     "venue": "Vicinagearth",
      7     "doi": "10.1007/s44336-024-00009-2"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["survey_methodology"],
     11   "methodology_tags": ["meta-analysis"],
     12   "key_findings": "This survey synthesizes LLM-based multi-agent systems into a five-component framework (profile, perception, self-action, mutual interaction, evolution) and categorizes applications into problem-solving and world simulation. It reviews representative works across software development, embodied agents, science experiments, gaming, societal simulation, and recommender systems. The paper identifies key challenges including LLM intrinsic constraints (hallucination, bias, black-box effects), scaling difficulties, and dynamic environment adaptation.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No code or analysis scripts are released. The paper states 'Code availability: Not applicable.'"
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
      23         "justification": "No dataset of surveyed papers, extracted data tables, or supplementary data files are provided. Although the paper states 'The authors confirm that the data and materials supporting the findings of this study are available within the article,' no separate data artifact accompanies the article."
     24       },
     25       "environment_specified": {
     26         "applies": false,
     27         "answer": false,
     28         "justification": "This is a survey paper with no computational experiments requiring environment specification."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No instructions for reproducing the survey methodology (search queries, databases used, date ranges, inclusion/exclusion criteria) are provided."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "Survey paper with no statistical analysis or aggregated quantitative results."
     41       },
     42       "significance_tests": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "Survey paper with no comparative statistical claims requiring significance testing."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "Survey paper with no experiments producing effect sizes."
     51       },
     52       "sample_size_justified": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "Survey paper with no experimental sample sizes to justify."
     56       },
     57       "variance_reported": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "Survey paper with no experimental runs to report variance across."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The survey does not compare itself against prior surveys of MAS or LLM agents in a structured way. While it references other works like [17-20], it does not explicitly position its coverage or framework against prior survey contributions."
     68       },
     69       "baselines_contemporary": {
     70         "applies": false,
     71         "answer": false,
     72         "justification": "No experimental baselines are applicable to a survey paper."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "Survey paper with no system components to ablate."
     78       },
     79       "multiple_metrics": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "Survey paper with no experimental evaluation metrics."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Human evaluation is not relevant to a survey paper's claims."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "Survey paper with no test sets."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table 1 provides a detailed per-work breakdown across multiple dimensions (object, modality, base model, training, feedback, evaluation, interaction). The paper also categorizes applications by domain in Table 2."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 5 (Discussion) addresses challenges including hallucination, bias, black-box effects, scaling issues, and dynamic environment adaptation. The paper discusses limitations of each generation strategy and technical approach throughout."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper discusses negative aspects: limitations of each approach (e.g., GAN latent vectors lack interpretability, captioning approach loses implicit visual information, LoRA performance may be inferior to full fine-tuning). Section 5 discusses open problems and challenges."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims to present a comprehensive survey with a five-component framework (profile, perception, self-action, mutual interaction, evolution), cover applications in problem-solving and world simulation, and discuss challenges. All these are substantiated in the paper body."
    115       },
    116       "causal_claims_justified": {
    117         "applies": false,
    118         "answer": false,
    119         "justification": "The paper is a survey/taxonomy and does not make causal claims about its own findings."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The abstract states MAS are 'a promising pathway towards realizing general artificial intelligence that is equivalent to or surpasses human-level intelligence' — a very broad claim not bounded by the evidence surveyed. The paper does not clearly state the boundaries of its coverage (search strategy, date range, inclusion criteria)."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": false,
    128         "answer": false,
    129         "justification": "As a survey/taxonomy paper presenting no empirical results of its own, alternative explanations are not applicable."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": false,
    133         "answer": false,
    134         "justification": "Theoretical/survey paper with no measurements of its own."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "Survey paper that does not run models. It reports model names used by surveyed works in Table 1."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "Survey paper that does not use prompting."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "Survey paper with no experiments requiring hyperparameter reporting."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "Survey paper with no agentic scaffolding of its own."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "The paper does not describe how papers were selected for inclusion. There is no description of search queries, databases searched, date ranges, or filtering criteria. The paper selection methodology is entirely undocumented."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations section. Section 5 discusses challenges of MAS in general but does not discuss limitations of the survey itself (coverage gaps, selection bias, etc.)."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats to validity of the survey methodology are discussed. The paper acknowledges the field is 'in its nascent stages and rapidly evolving' but does not discuss specific threats to the survey's comprehensiveness or validity."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state scope boundaries — no inclusion/exclusion criteria, no date range, no search methodology. It broadly covers 'LLM-based multi-agent systems' without defining what falls outside scope."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw data (list of all papers considered, extraction spreadsheets, coding decisions) is available for verification."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The paper does not describe how the surveyed papers were collected. There is no mention of databases searched, search terms used, or time period covered."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants; data source is academic literature. However, the paper selection process itself is undocumented."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "No data pipeline is documented. The paper jumps from 'we survey LLM-based MAS' to presenting categorized results without describing the intermediate selection and categorization process."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Funding section states: 'This work was partially supported by the National Natural Science Foundation of China (Ref. No.: 62372341) and the Fundamental Research Funds for the Central Universities (Ref. No.: 2042024kf0040).'"
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: Wuhan University and Zhejiang University. No conflicts with evaluated products."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": true,
     217         "justification": "Funding comes from the National Natural Science Foundation of China and the Fundamental Research Funds for the Central Universities, neither of which has a financial stake in the survey's conclusions."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "The competing interests section explicitly states: 'The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.'"
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this survey paper."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants. Paper states 'Ethics approval and consent to participate: Not applicable.'"
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this survey paper."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this survey paper."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this survey paper."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this survey paper."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this survey paper."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "Survey paper with no computational method of its own."
    284       },
    285       "compute_budget_stated": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "Survey paper with no computational experiments."
    289       }
    290     },
    291     "survey_methodology": {
    292       "prisma_or_structured_protocol": {
    293         "applies": true,
    294         "answer": false,
     295         "justification": "No PRISMA diagram, no structured search protocol, no reproducible search queries, no description of databases searched or date ranges. The paper collection methodology is entirely undocumented."
    296       },
    297       "quality_assessment_of_sources": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The survey does not assess the methodological quality of any surveyed paper. All papers are treated equally regardless of their rigor. Table 1 catalogues properties but does not evaluate quality."
    301       },
    302       "publication_bias_discussed": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No discussion of publication bias, negative-result underrepresentation, or whether the surveyed literature skews toward positive results."
    306       }
    307     }
    308   },
    309   "claims": [
    310     {
    311       "claim": "LLM-based multi-agent systems can be decomposed into five key components: profile, perception, self-action, mutual interaction, and evolution.",
    312       "evidence": "Sections 3.1-3.5 systematically describe each component with references to dozens of representative works. Figure 1 provides a visual overview.",
    313       "supported": "moderate"
    314     },
    315     {
    316       "claim": "LLM-based MAS are 'a promising pathway towards realizing general artificial intelligence that is equivalent to or surpasses human-level intelligence.'",
    317       "evidence": "Abstract claim. No quantitative evidence or systematic analysis is provided to support this specific claim about AGI potential.",
    318       "supported": "unsupported"
    319     },
    320     {
    321       "claim": "Multi-agent interaction structures can be categorized into four types: hierarchical, decentralized, centralized, and shared memory.",
    322       "evidence": "Section 3.4.2 describes each type with representative examples (DyLAN for hierarchical, DMAS for decentralized, ACORM for centralized, MetaGPT for shared memory).",
    323       "supported": "moderate"
    324     },
    325     {
    326       "claim": "LLM-based MAS applications span problem-solving (software development, embodied agents, science experiments, debates) and world simulation (gaming, societal simulation, economics, recommender systems, disease propagation).",
    327       "evidence": "Table 2 and Section 4 provide representative works for each application domain with brief descriptions.",
    328       "supported": "moderate"
    329     }
    330   ],
    331   "red_flags": [
    332     {
    333       "flag": "No systematic review methodology",
    334       "detail": "The paper does not describe how papers were selected for inclusion. There are no search queries, databases, date ranges, or inclusion/exclusion criteria. This makes the survey non-reproducible and susceptible to selection bias."
    335     },
    336     {
    337       "flag": "No quality assessment of surveyed papers",
    338       "detail": "All surveyed works are presented without any assessment of their methodological quality. A survey that summarizes papers without evaluating their rigor launders the signal-to-noise ratio of its sources — weak and strong papers receive equal treatment."
    339     },
    340     {
    341       "flag": "Overclaiming about AGI potential",
    342       "detail": "The abstract claims MAS are 'a promising pathway towards realizing general artificial intelligence that is equivalent to or surpasses human-level intelligence' — an extraordinary claim not supported by the evidence reviewed."
    343     },
    344     {
    345       "flag": "Missing scope boundaries",
    346       "detail": "The survey does not define what falls inside or outside its scope. No temporal, methodological, or topical boundaries are stated, making it impossible to assess coverage completeness."
    347     }
    348   ],
    349   "cited_papers": [
    350     {
    351       "title": "ChatDev: Communicative agents for software development",
    352       "authors": ["C. Qian", "W. Liu", "H. Liu"],
    353       "year": 2024,
    354       "relevance": "Multi-agent system for software development using LLM-based communicative agents."
    355     },
    356     {
    357       "title": "MetaGPT: Meta programming for multi-agent collaborative framework",
    358       "authors": ["S. Hong", "X. Zheng", "J. Chen"],
    359       "year": 2024,
    360       "relevance": "Multi-agent collaborative framework with standardized operating procedures for software development."
    361     },
    362     {
    363       "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversation framework",
    364       "authors": ["Q. Wu", "G. Bansal", "J. Zhang"],
    365       "year": 2023,
    366       "arxiv_id": "2308.08155",
    367       "relevance": "Multi-agent conversation framework enabling hierarchical LLM agent interactions."
    368     },
    369     {
    370       "title": "A survey on large language model based autonomous agents",
    371       "authors": ["L. Wang", "C. Ma", "X. Feng"],
    372       "year": 2024,
    373       "relevance": "Comprehensive survey of LLM-based autonomous agents covering single-agent architectures."
    374     },
    375     {
    376       "title": "Generative agents: Interactive simulacra of human behavior",
    377       "authors": ["J.S. Park", "J. O'Brien", "C.J. Cai"],
    378       "year": 2023,
    379       "relevance": "Foundational work on generative agents simulating human behavior with memory and reflection."
    380     },
    381     {
    382       "title": "The rise and potential of large language model based agents: A survey",
    383       "authors": ["Z. Xi", "W. Chen", "X. Guo"],
    384       "year": 2023,
    385       "arxiv_id": "2309.07864",
    386       "relevance": "Survey of LLM-based agents covering capabilities, challenges, and potential applications."
    387     },
    388     {
    389       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    390       "authors": ["J. Wei", "X. Wang", "D. Schuurmans"],
    391       "year": 2022,
    392       "relevance": "Foundational prompting technique for LLM reasoning used extensively in agent systems."
    393     },
    394     {
    395       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    396       "authors": ["P. Lewis", "E. Perez", "A. Piktus"],
    397       "year": 2020,
    398       "relevance": "RAG framework widely used in LLM agent knowledge retrieval systems."
    399     },
    400     {
    401       "title": "Tree of thoughts: Deliberate problem solving with large language models",
    402       "authors": ["S. Yao", "D. Yu", "J. Zhao"],
    403       "year": 2024,
    404       "relevance": "Multi-path reasoning approach for LLM agents enabling structured problem solving."
    405     },
    406     {
    407       "title": "Reflexion: Language agents with verbal reinforcement learning",
    408       "authors": ["N. Shinn", "F. Cassano", "A. Gopinath"],
    409       "year": 2024,
    410       "relevance": "Verbal reinforcement learning framework for agent self-improvement without weight updates."
    411     },
    412     {
    413       "title": "PENTESTGPT: An LLM-empowered automatic penetration testing tool",
    414       "authors": ["G. Deng", "Y. Liu", "V. Mayoral-Vilches"],
    415       "year": 2023,
    416       "arxiv_id": "2308.06782",
    417       "relevance": "LLM-based security testing agent relevant to AI safety and agentic workflows."
    418     },
    419     {
    420       "title": "Voyager: An open-ended embodied agent with large language models",
    421       "authors": ["G. Wang", "Y. Xie", "Y. Jiang"],
    422       "year": 2023,
    423       "arxiv_id": "2305.16291",
    424       "relevance": "Open-ended LLM agent for Minecraft demonstrating exploration and skill acquisition."
    425     }
    426   ]
    427 }

Impressum · Datenschutz