ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (20274B)


      1 {
      2   "paper": {
      3     "title": "The Rise and Potential of Large Language Model Based Agents: A Survey",
      4     "authors": ["Zhiheng Xi", "Wenxiang Chen", "Xin Guo", "Wei He", "Yiwen Ding", "Boyang Hong", "Ming Zhang", "Junzhe Wang", "Senjie Jin", "Enyu Zhou", "Rui Zheng", "Xiaoran Fan", "Xiao Wang", "Limao Xiong", "Yuhao Zhou", "Weiran Wang", "Changhao Jiang", "Yicheng Zou", "Xiangyang Liu", "Zhangyue Yin", "Shihan Dou", "Rongxiang Weng", "Wensen Cheng", "Qi Zhang", "Wenjuan Qin", "Yongyan Zheng", "Xipeng Qiu", "Xuanjing Huang", "Tao Gui"],
      5     "year": 2023,
      6     "venue": "arXiv",
      7     "arxiv_id": "2309.07864",
      8     "doi": "10.48550/arXiv.2309.07864"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["survey_methodology"],
     12   "methodology_tags": ["meta-analysis"],
     13   "key_findings": "This survey presents a comprehensive framework for LLM-based agents comprising brain, perception, and action components. It categorizes applications into single-agent, multi-agent, and human-agent cooperation scenarios, and explores agent societies including emergent social behaviors and personality traits. The paper identifies open problems including scaling agent numbers, bridging virtual-to-physical environments, and the debate over LLM-based agents as a path to AGI.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper provides a GitHub repository for related papers: https://github.com/WooooDyy/LLM-Agent-Paper-List, mentioned in the abstract."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No structured dataset of the surveyed papers (e.g., extracted metadata, classification data) is released. Only a curated paper list repository is provided."
     25       },
     26       "environment_specified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "This is a survey paper with no computational experiments requiring environment specification."
     30       },
     31       "reproduction_instructions": {
     32         "applies": false,
     33         "answer": false,
     34         "justification": "This is a survey paper with no experiments to reproduce."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "Survey paper with no experiments or statistical analyses."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "Survey paper with no experiments or statistical analyses."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "Survey paper with no experiments or statistical analyses."
     52       },
     53       "sample_size_justified": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "Survey paper with no experiments."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "Survey paper with no experiments."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The survey does not compare itself against prior surveys or reviews of LLM-based agents in a structured way."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No structured comparison with prior surveys is provided."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "Survey paper with no system components to ablate."
     79       },
     80       "multiple_metrics": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Survey paper with no quantitative evaluation."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Survey paper; human evaluation of the survey's quality is not applicable."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "Survey paper with no experiments."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The survey provides detailed typological breakdowns across categories: brain/perception/action components (§3), single/multi/human-agent applications (§4), agent society dimensions (§5), with figures showing detailed taxonomies (Figures 3-6, 11)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 6 discusses limitations and challenges of LLM-based agents including adversarial robustness issues, hallucinations, challenges of scaling, and risks. Section 6.3 covers security and trustworthiness concerns."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper discusses limitations of various approaches: RL-based agents' sample inefficiency (§2.2), negative transfer in transfer learning (§2.2), hallucination problems (§3.1.2), multi-agent debate converging to incorrect consensus (§4.2.2), and ethical risks (§5.3.3)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims to provide a comprehensive survey covering agent construction, applications, and agent society, which the paper delivers across its seven sections."
    116       },
    117       "causal_claims_justified": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "The paper is a literature survey and does not make causal claims based on its own experiments."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper makes broad claims about LLM-based agents as 'potential sparks for AGI' and their suitability as agent foundations without bounding these claims to specific models or contexts tested."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": false,
    129         "answer": false,
    130         "justification": "As a pure survey/taxonomy, there are no empirical results requiring alternative explanations."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": false,
    134         "answer": false,
    135         "justification": "Theoretical/survey paper with no measurements."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "Survey paper that does not run any models."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "Survey paper that does not use prompting."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "Survey paper with no experiments."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "Survey paper with no agentic scaffolding used."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper does not describe how papers were selected for inclusion in the survey. There is no description of search queries, databases used, inclusion/exclusion criteria, or filtering process. Papers appear to have been collected ad-hoc."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations section discussing the limitations of the survey itself. Section 6 discusses limitations of LLM-based agents as a technology, not limitations of the survey's methodology."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity of the survey are discussed. The paper does not acknowledge potential biases in paper selection, coverage gaps, or methodological limitations of the review."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what is excluded from its scope. It broadly covers 'LLM-based agents' without defining specific boundaries for inclusion or exclusion of topics."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No structured dataset of surveyed papers with metadata is available. Only a curated paper list on GitHub."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The paper does not describe how the surveyed papers were collected — no search strategy, databases, or time period are specified."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants were recruited; the data source is published literature rather than a standard benchmark."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No documentation of the paper selection pipeline, filtering steps, or how the final set of surveyed papers was determined."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "The paper contains no funding information and no acknowledgments section listing grants. The only acknowledgments are to a professor for ethics review and to an artist for Figure 1."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors are listed as being from 'Fudan NLP Group' with corresponding author emails at m.fudan.edu.cn and fudan.edu.cn."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding information is disclosed, so independence cannot be assessed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is provided."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Survey paper that does not evaluate any model on a benchmark."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Survey paper that does not evaluate any model on a benchmark."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Survey paper that does not evaluate any model on a benchmark."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this survey paper."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this survey paper."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this survey paper."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this survey paper."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this survey paper."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this survey paper."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this survey paper."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "Survey paper with no computational method of its own."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "Survey paper with no computational experiments."
    290       }
    291     },
    292     "survey_methodology": {
    293       "prisma_or_structured_protocol": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No PRISMA diagram, no structured search strategy, no reproducible search queries, and no explicit review protocol. The paper collection appears ad-hoc."
    297       },
    298       "quality_assessment_of_sources": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The survey does not assess the methodological quality of the papers it reviews. All cited papers are treated equally regardless of their rigor, conflating well-controlled studies with blog-post-level demonstrations."
    302       },
    303       "publication_bias_discussed": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No discussion of publication bias. The survey does not consider whether its sources are biased toward positive results or whether negative findings about LLM agents are underrepresented."
    307       }
    308     }
    309   },
    310   "claims": [
    311     {
    312       "claim": "LLMs are suitable foundations for building AI agents due to their autonomy, reactivity, pro-activeness, and social ability.",
    313       "evidence": "Section 2.3 argues this based on properties like language generation capability, in-context learning, and reasoning abilities, citing GPT-4, AutoGPT, and various prompting studies.",
    314       "supported": "moderate"
    315     },
    316     {
    317       "claim": "LLM-based agents can exhibit reasoning and planning abilities comparable to symbolic agents through techniques like Chain-of-Thought and problem decomposition.",
    318       "evidence": "Section 2.2 references CoT (Wei et al.), problem decomposition methods, and various planning approaches. Evidence is drawn from cited works rather than original experiments.",
    319       "supported": "moderate"
    320     },
    321     {
    322       "claim": "Multiple LLM-based agents coexisting can lead to the emergence of social phenomena.",
    323       "evidence": "Section 5 cites Generative Agents (Park et al., 2023) and other simulation studies. The claim is supported by references to existing work but no original experiments.",
    324       "supported": "moderate"
    325     },
    326     {
    327       "claim": "Cooperative multi-agent systems can improve task efficiency and response quality compared to single agents.",
    328       "evidence": "Section 4.2 cites ChatDev, MetaGPT, CAMEL, and AgentVerse as examples. Evidence is from cited works.",
    329       "supported": "moderate"
    330     }
    331   ],
    332   "red_flags": [
    333     {
    334       "flag": "No systematic review methodology",
    335       "detail": "The survey provides no description of how papers were selected, what databases were searched, what search terms were used, or what inclusion/exclusion criteria were applied. This makes the coverage non-reproducible and potentially biased toward the authors' existing knowledge."
    336     },
    337     {
    338       "flag": "No quality assessment of surveyed papers",
    339       "detail": "All cited works are presented without any assessment of their methodological quality. Demo-level projects (AutoGPT, BabyAGI) are presented alongside peer-reviewed research without distinction, potentially laundering weak evidence."
    340     },
    341     {
    342       "flag": "Uncritical framing of LLMs as AGI sparks",
    343       "detail": "The paper adopts the 'sparks of AGI' framing from Bubeck et al. without critical examination, and repeatedly uses aspirational language about LLM agents achieving human-level capabilities. Section 6.5 presents the AGI debate but the overall tone is promotional."
    344     },
    345     {
    346       "flag": "Scope boundaries undefined",
    347       "detail": "The survey covers an extremely broad scope (philosophy, agent construction, applications, society simulation, ethics, AGI debate) without clearly defining what is in or out of scope, making completeness impossible to assess."
    348     }
    349   ],
    350   "cited_papers": [
    351     {
    352       "title": "Generative agents: Interactive simulacra of human behavior",
    353       "authors": ["Joon Sung Park", "Joseph C. O'Brien", "Carrie J. Cai"],
    354       "year": 2023,
    355       "arxiv_id": "2304.03442",
    356       "relevance": "Seminal work on LLM-based agent societies with emergent social behaviors, directly relevant to agentic AI capabilities."
    357     },
    358     {
    359       "title": "Voyager: An open-ended embodied agent with large language models",
    360       "authors": ["Guanzhi Wang", "Yuqi Xie", "Yunfan Jiang"],
    361       "year": 2023,
    362       "arxiv_id": "2305.16291",
    363       "relevance": "First LLM-based embodied lifelong learning agent, demonstrating autonomous skill acquisition in Minecraft."
    364     },
    365     {
    366       "title": "Communicative agents for software development",
    367       "authors": ["Chen Qian", "Xin Cong", "Cheng Yang"],
    368       "year": 2023,
    369       "arxiv_id": "2307.07924",
    370       "relevance": "ChatDev multi-agent software development system, key example of agentic AI for code generation."
    371     },
    372     {
    373       "title": "MetaGPT: Meta programming for multi-agent collaborative framework",
    374       "authors": ["Sirui Hong", "Xiawu Zheng", "Jonathan Chen"],
    375       "year": 2023,
    376       "arxiv_id": "2308.00352",
    377       "relevance": "Multi-agent framework inspired by software engineering workflows, relevant to AI-assisted programming."
    378     },
    379     {
    380       "title": "Tool learning with foundation models",
    381       "authors": ["Yujia Qin", "Shengding Hu", "Yankai Lin"],
    382       "year": 2023,
    383       "arxiv_id": "2304.08354",
    384       "relevance": "Comprehensive study of tool use in LLM-based agents, core capability for agentic workflows."
    385     },
    386     {
    387       "title": "AgentBench: Evaluating LLMs as agents",
    388       "authors": ["Xiao Liu", "Hao Yu", "Hanchen Zhang"],
    389       "year": 2023,
    390       "arxiv_id": "2308.03688",
    391       "relevance": "Benchmark for evaluating LLM agent capabilities across diverse real-world scenarios."
    392     },
    393     {
    394       "title": "Autogen: Enabling next-gen LLM applications via multi-agent conversation framework",
    395       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    396       "year": 2023,
    397       "arxiv_id": "2308.08155",
    398       "relevance": "Multi-agent conversation framework enabling cooperative LLM agent interactions."
    399     },
    400     {
    401       "title": "CAMEL: communicative agents for 'mind' exploration of large scale language model society",
    402       "authors": ["Guohao Li", "Hasan Abed Al Kader Hammoud", "Hani Itani"],
    403       "year": 2023,
    404       "arxiv_id": "2303.17760",
    405       "relevance": "Role-playing multi-agent communication framework for exploring LLM agent cooperation."
    406     },
    407     {
    408       "title": "Toolformer: Language models can teach themselves to use tools",
    409       "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì"],
    410       "year": 2023,
    411       "arxiv_id": "2302.04761",
    412       "relevance": "Self-supervised tool learning for LLMs, foundational work on agent tool use capabilities."
    413     },
    414     {
    415       "title": "ReAct: Synergizing reasoning and acting in language models",
    416       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"],
    417       "year": 2023,
    418       "relevance": "Key framework combining reasoning and acting in LLM agents, widely used in agentic systems."
    419     },
    420     {
    421       "title": "Reflexion: Language agents with verbal reinforcement learning",
    422       "authors": ["Noah Shinn", "Federico Cassano", "Beck Labash"],
    423       "year": 2023,
    424       "arxiv_id": "2303.11366",
    425       "relevance": "Self-reflection mechanism for LLM agents enabling learning from failures."
    426     },
    427     {
    428       "title": "Constitutional AI: harmlessness from AI feedback",
    429       "authors": ["Yuntao Bai", "Saurav Kadavath", "Sandipan Kundu"],
    430       "year": 2022,
    431       "arxiv_id": "2212.08073",
    432       "relevance": "AI safety alignment approach relevant to ensuring agent safety and trustworthiness."
    433     }
    434   ]
    435 }

Impressum · Datenschutz