scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20288B)
      1 {
      2   "paper": {
      3     "title": "Exploring Large Language Model based Intelligent Agents: Definitions, Methods, and Prospects",
      4     "authors": [
      5       "Yuheng Cheng",
      6       "Ceyao Zhang",
      7       "Zhengwen Zhang",
      8       "Xiangrui Meng",
      9       "Sirui Hong",
     10       "Wenhao Li",
     11       "Zihao Wang",
     12       "Zekai Wang",
     13       "Feng Yin",
     14       "Junhua Zhao",
     15       "Xiuqiang He"
     16     ],
     17     "year": 2024,
     18     "venue": "arXiv",
     19     "arxiv_id": "2401.03428",
     20     "doi": "10.48550/arXiv.2401.03428"
     21   },
     22   "scan_version": 2,
     23   "active_modules": ["survey_methodology"],
     24   "methodology_tags": ["meta-analysis"],
     25   "key_findings": "This survey provides a comprehensive taxonomy of LLM-based agents covering single-agent and multi-agent systems, decomposing them into planning, memory, rethinking, environment, and action components. It catalogs over 80 specific systems across tables and categorizes multi-agent relationships as cooperative, competitive, mixed, or hierarchical with CPDE/DPDE planning types. The paper covers prospect applications across natural sciences, social sciences, engineering, and universal agents, but provides no quantitative analysis or quality assessment of the surveyed works.",
     26   "checklist": {
     27     "artifacts": {
     28       "code_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No repository URL, code archive, or analysis scripts are provided in the paper."
     32       },
     33       "data_released": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No dataset of surveyed papers, extracted metadata, or analysis data is released."
     37       },
     38       "environment_specified": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "This is a survey paper with no computational experiments requiring environment specification."
     42       },
     43       "reproduction_instructions": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No instructions are provided for reproducing the survey's paper selection or analysis process."
     47       }
     48     },
     49     "statistical_methodology": {
     50       "confidence_intervals_or_error_bars": {
     51         "applies": false,
     52         "answer": false,
     53         "justification": "Survey paper with no statistical experiments."
     54       },
     55       "significance_tests": {
     56         "applies": false,
     57         "answer": false,
     58         "justification": "Survey paper with no statistical comparisons."
     59       },
     60       "effect_sizes_reported": {
     61         "applies": false,
     62         "answer": false,
     63         "justification": "Survey paper with no quantitative experiments."
     64       },
     65       "sample_size_justified": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "Survey paper; no statistical sample size to justify."
     69       },
     70       "variance_reported": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "Survey paper with no experimental runs."
     74       }
     75     },
     76     "evaluation_design": {
     77       "baselines_included": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "The survey does not compare itself against prior surveys of LLM-based agents or position itself relative to other survey efforts."
     81       },
     82       "baselines_contemporary": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No baselines are included so contemporaneity cannot be assessed."
     86       },
     87       "ablation_study": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "Survey paper with no system to ablate."
     91       },
     92       "multiple_metrics": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "Survey paper with no quantitative evaluation."
     96       },
     97       "human_evaluation": {
     98         "applies": false,
     99         "answer": false,
    100         "justification": "Survey paper; human evaluation of outputs is not relevant."
    101       },
    102       "held_out_test_set": {
    103         "applies": false,
    104         "answer": false,
    105         "justification": "Survey paper with no test set."
    106       },
    107       "per_category_breakdown": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Tables 1 and 2 provide per-system breakdowns across multiple dimensions (field, training, data, evaluation, modality, feedback, tool, planning, review) for both single-agent and multi-agent systems."
    111       },
    112       "failure_cases_discussed": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 6.2 discusses challenges and limitations including intrinsic LLM constraints (context length, hallucinations), dynamic scaling issues, and security/trust concerns."
    116       },
    117       "negative_results_reported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 6.2.1 discusses LLM limitations (hallucinations, context length constraints), and Section 1.2 discusses RL-based agent limitations. The paper also notes challenges in each application domain."
    121       }
    122     },
    123     "claims_and_evidence": {
    124       "abstract_claims_supported": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The abstract claims to provide an 'in-depth overview of LLM-based intelligent agents within single-agent and multi-agent systems' covering definitions, frameworks, and components. The paper does deliver this through Sections 2-3 with detailed taxonomies and tables."
    128       },
    129       "causal_claims_justified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "The paper is a survey/taxonomy and does not make causal claims about system performance."
    133       },
    134       "generalization_bounded": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper makes broad claims about LLM-based agents' capabilities and prospects across many domains (natural sciences, social sciences, engineering, military) without bounding the scope of its survey methodology or acknowledging areas where its coverage may be incomplete."
    138       },
    139       "alternative_explanations_discussed": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "This is a taxonomy/survey paper presenting no empirical results that require alternative explanations."
    143       },
    144       "proxy_outcome_distinction": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "Theoretical/survey paper with no measurements."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "Survey paper that does not run models."
    155       },
    156       "prompts_provided": {
    157         "applies": false,
    158         "answer": false,
    159         "justification": "Survey paper that does not use prompting."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "Survey paper with no experiments."
    165       },
    166       "scaffolding_described": {
    167         "applies": false,
    168         "answer": false,
    169         "justification": "Survey paper with no agentic scaffolding used."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "The paper does not describe how papers were selected for the survey. No search queries, databases searched, inclusion/exclusion criteria, or filtering pipeline are documented."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 6.2 'Challenges' discusses intrinsic constraints of LLMs (6.2.1), dynamic scaling (6.2.2), and security and trust (6.2.3). While titled 'Challenges' rather than 'Limitations,' it serves a similar function with substantive discussion."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "Section 6.2 discusses challenges of LLM-based agents generally but does not discuss specific threats to the validity of this survey itself — no mention of selection bias, coverage gaps, or limitations of the review methodology."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The paper does not explicitly state what it excludes from scope. It covers an extremely broad range of domains without stating boundaries on what was NOT covered or what claims the survey is NOT making."
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No raw data (paper lists, search results, extracted metadata) is made available for verification."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "The paper does not describe how the surveyed papers were found or collected. No search strategy, databases, or time period is mentioned."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No human participants; data sources are published papers but the selection method is unstated (covered by data_collection_described)."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No pipeline from paper discovery to final inclusion is documented. The reader cannot reconstruct how the ~80+ systems in Tables 1-2 were identified."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No funding sources or acknowledgments section is present in the paper."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Author affiliations are listed: CUHK Shenzhen, DeepWisdom, Peking University, Yantu.ai, FiT Tencent. Notably, co-author Sirui Hong is from DeepWisdom (creators of MetaGPT), and MetaGPT is featured prominently in the survey."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No funding is disclosed, so independence cannot be assessed. Given affiliations with DeepWisdom (MetaGPT) and Tencent, potential conflicts exist but are not addressed."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests statement is present. Authors affiliated with DeepWisdom (MetaGPT) and Tencent may have financial interests in systems discussed."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "Survey paper that does not evaluate any model on benchmarks."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "Survey paper that does not evaluate any model on benchmarks."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "Survey paper that does not evaluate any model on benchmarks."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this survey."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this survey."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this survey."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this survey."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this survey."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this survey."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants in this survey."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": false,
    295         "answer": false,
    296         "justification": "Survey paper with no computational method to cost."
    297       },
    298       "compute_budget_stated": {
    299         "applies": false,
    300         "answer": false,
    301         "justification": "Survey paper with no computational experiments."
    302       }
    303     },
    304     "survey_methodology": {
    305       "prisma_or_structured_protocol": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "No PRISMA flow diagram, structured search strategy, or review protocol is described. The paper appears to use ad-hoc paper collection."
    309       },
    310       "quality_assessment_of_sources": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "The survey treats all cited systems equally with no quality scoring, risk-of-bias assessment, or evaluation of the methodological rigor of included studies."
    314       },
    315       "publication_bias_discussed": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "No discussion of publication bias — the survey does not consider whether its sources skew toward positive results or whether negative results about LLM-based agents are underrepresented."
    319       }
    320     }
    321   },
    322   "claims": [
    323     {
    324       "claim": "LLM-based agents offer robust generalization capabilities across various applications compared to RL-based agents",
    325       "evidence": "Section 1.3 lists advantages: potent NLP and comprehensive knowledge, zero-shot/few-shot learning, organic human-computer interaction. No quantitative evidence provided.",
    326       "supported": "weak"
    327     },
    328     {
    329       "claim": "LLM-based agents can be categorized into single-agent and multi-agent systems with distinct component frameworks",
    330       "evidence": "Sections 2-3 provide detailed taxonomies with Tables 1 and 2 listing 34 single-agent and 11 multi-agent systems with structured attributes.",
    331       "supported": "moderate"
    332     },
    333     {
    334       "claim": "LLM-based agents have broad application prospects across natural sciences, social sciences, engineering systems, and universal autonomous agents",
    335       "evidence": "Section 5 discusses prospects across 7+ domains but most subsections describe potential future directions rather than demonstrated results.",
    336       "supported": "weak"
    337     }
    338   ],
    339   "red_flags": [
    340     {
    341       "flag": "No systematic review methodology",
    342       "detail": "The survey provides no description of how papers were selected, what databases were searched, what time period was covered, or what inclusion/exclusion criteria were used. This makes the survey unreproducible and potentially biased toward well-known or author-affiliated systems."
    343     },
    344     {
    345       "flag": "Potential conflict of interest — MetaGPT",
    346       "detail": "Co-author Sirui Hong is from DeepWisdom, the company behind MetaGPT. MetaGPT appears prominently in Tables 1-2 and is discussed multiple times throughout the paper. This conflict is not disclosed or acknowledged."
    347     },
    348     {
    349       "flag": "No quality assessment of surveyed work",
    350       "detail": "The survey catalogs systems without any assessment of their methodological rigor, reproducibility, or evidence quality. All systems are presented as equally valid contributions regardless of whether they have been evaluated, peer-reviewed, or reproduced."
    351     },
    352     {
    353       "flag": "Speculative prospect sections",
    354       "detail": "Section 5 (Prospect Applications) spans 13 pages across many domains (military, politics, climate, biology, etc.) and consists largely of speculation about what LLM-based agents 'could' do, with minimal grounding in demonstrated capabilities."
    355     },
    356     {
    357       "flag": "Claims outrun evidence",
    358       "detail": "The paper makes sweeping claims about LLM-based agents' potential across many domains (e.g., military strategy, drug discovery, climate simulation) without evidence that these applications have been demonstrated. Most prospect sections are aspirational rather than evidence-based."
    359     }
    360   ],
    361   "cited_papers": [
    362     {
    363       "title": "Voyager: An open-ended embodied agent with large language models",
    364       "authors": ["Guanzhi Wang", "Yuqi Xie", "Yunfan Jiang"],
    365       "year": 2023,
    366       "arxiv_id": "2305.16291",
    367       "relevance": "Key example of LLM-based game agent with lifelong learning in Minecraft, relevant to agent capability evaluation."
    368     },
    369     {
    370       "title": "MetaGPT: Meta programming for multi-agent collaborative framework",
    371       "authors": ["Sirui Hong", "Xiawu Zheng", "Jonathan Chen"],
    372       "year": 2023,
    373       "arxiv_id": "2308.00352",
    374       "relevance": "Multi-agent software development framework using SOPs, central to AI-assisted coding research."
    375     },
    376     {
    377       "title": "Communicative agents for software development (ChatDev)",
    378       "authors": ["Chen Qian", "Xin Cong", "Cheng Yang"],
    379       "year": 2023,
    380       "arxiv_id": "2307.07924",
    381       "relevance": "Virtual chat-driven software company using multi-agent collaboration for end-to-end development."
    382     },
    383     {
    384       "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversation framework",
    385       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    386       "year": 2023,
    387       "arxiv_id": "2308.08155",
    388       "relevance": "Multi-agent framework for task decomposition through dialogue, key infrastructure for agentic AI."
    389     },
    390     {
    391       "title": "ToolLLM: Facilitating large language models to master 16000+ real-world APIs",
    392       "authors": ["Yujia Qin", "Shihao Liang", "Yining Ye"],
    393       "year": 2023,
    394       "arxiv_id": "2307.16789",
    395       "relevance": "Framework for tool use evaluation and instruction tuning, foundational for agent benchmarking."
    396     },
    397     {
    398       "title": "AgentBench: Evaluating LLMs as agents",
    399       "authors": ["Xiao Liu", "Hao Yu", "Hanchen Zhang"],
    400       "year": 2023,
    401       "arxiv_id": "2308.03688",
    402       "relevance": "Comprehensive benchmark for evaluating LLM agent capabilities across multiple environments."
    403     },
    404     {
    405       "title": "ReAct: Synergizing reasoning and acting in language models",
    406       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"],
    407       "year": 2022,
    408       "arxiv_id": "2210.03629",
    409       "relevance": "Foundational reasoning-action framework for LLM agents, widely adopted in agentic AI."
    410     },
    411     {
    412       "title": "Reflexion: Language agents with verbal reinforcement learning",
    413       "authors": ["Noah Shinn", "Federico Cassano", "Beck Labash"],
    414       "year": 2023,
    415       "arxiv_id": "2303.11366",
    416       "relevance": "Self-reflection mechanism for agent improvement, key rethinking capability for LLM agents."
    417     },
    418     {
    419       "title": "Generative agents: Interactive simulacra of human behavior",
    420       "authors": ["Joon Sung Park", "Joseph C O'Brien", "Carrie J Cai"],
    421       "year": 2023,
    422       "arxiv_id": "2304.03442",
    423       "relevance": "Influential multi-agent social simulation framework demonstrating emergent behavior."
    424     },
    425     {
    426       "title": "Toolformer: Language models can teach themselves to use tools",
    427       "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì"],
    428       "year": 2023,
    429       "arxiv_id": "2302.04761",
    430       "relevance": "Foundational work on self-taught tool use in LLMs, enabling agent action capabilities."
    431     },
    432     {
    433       "title": "Identifying the risks of LM agents with an LM-emulated sandbox (ToolEmu)",
    434       "authors": ["Yangjun Ruan", "Honghua Dong", "Andrew Wang"],
    435       "year": 2023,
    436       "relevance": "Safety evaluation framework for LLM agents using simulated tool execution."
    437     },
    438     {
    439       "title": "Tree of thoughts: Deliberate problem solving with large language models",
    440       "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"],
    441       "year": 2023,
    442       "arxiv_id": "2305.10601",
    443       "relevance": "Advanced planning method for LLM agents using tree-structured reasoning exploration."
    444     }
    445   ]
    446 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs