ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (19682B)


      1 {
      2   "paper": {
      3     "title": "A Survey on Large Language Model based Autonomous Agents",
      4     "authors": ["Lei Wang", "Chen Ma", "Xueyang Feng", "Zeyu Zhang", "Hao Yang", "Jingsen Zhang", "Zhi-Yuan Chen", "Jiakai Tang", "Xu Chen", "Yankai Lin", "Wayne Xin Zhao", "Zhewei Wei", "Ji-Rong Wen"],
      5     "year": 2023,
      6     "venue": "Frontiers of Computer Science",
      7     "arxiv_id": "2308.11432",
      8     "doi": "10.1007/s11704-024-40231-1"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["survey_methodology"],
     12   "methodology_tags": ["meta-analysis"],
     13   "key_findings": "This survey proposes a unified framework for LLM-based autonomous agents comprising profiling, memory, planning, and action modules. It categorizes agent capability acquisition into fine-tuning and non-fine-tuning strategies. The paper systematically reviews applications across social science, natural science, and engineering, and discusses evaluation strategies including subjective and objective approaches. Six key challenges are identified: role-playing capability, generalized human alignment, prompt robustness, hallucination, knowledge boundary, and efficiency.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No code repository or analysis scripts are provided. The paper is a survey with no released artifacts."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No structured dataset of surveyed papers, extracted metadata, or analysis data is released."
     25       },
     26       "environment_specified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "This is a survey paper with no computational experiments requiring environment specification."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No instructions for reproducing the survey's paper collection, categorization, or analysis process."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "Survey paper with no statistical experiments."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "Survey paper with no statistical experiments."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "Survey paper with no statistical experiments."
     52       },
     53       "sample_size_justified": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "Survey paper with no statistical experiments."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "Survey paper with no statistical experiments."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The survey does not compare against prior surveys or reviews in a structured way. Section 5 mentions related surveys but does not systematically compare coverage, scope, or findings."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "Related surveys are briefly mentioned in Section 5 but not compared on coverage, methodology, or conclusions."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "Survey paper — no system with components to ablate."
     79       },
     80       "multiple_metrics": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Survey paper — no experimental evaluation with metrics."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Survey paper — no system outputs to evaluate."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "Survey paper — no test sets involved."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The survey provides detailed per-category breakdowns via taxonomies: agent architectures (profiling, memory, planning, action modules), applications (social science, natural science, engineering), and evaluation strategies (subjective, objective). Tables 1-3 organize papers by category."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 6 discusses six challenges/limitations of LLM-based agents: role-playing capability, hallucination, knowledge boundary, prompt robustness, generalized human alignment, and efficiency."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper discusses limitations and challenges (Section 6) including 'hyper-accuracy distortion' in psychology experiments, harmful content generation, and hallucination problems."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims to present a comprehensive survey with a unified framework, overview of applications, and evaluation strategies — all of which are delivered in Sections 2-4."
    116       },
    117       "causal_claims_justified": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "The paper makes no causal claims; it is a descriptive survey organizing and categorizing existing work."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The survey claims comprehensiveness but covers papers primarily from 2021 through August 2023 (Figure 1). This temporal boundary is shown in the figure but not explicitly discussed as a scope limitation. The title implies coverage of the entire field."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": false,
    129         "answer": false,
    130         "justification": "Pure survey/taxonomy paper with no empirical results requiring alternative explanations."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": false,
    134         "answer": false,
    135         "justification": "Theoretical/survey paper with no measurements."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "Survey paper — no models used in experiments."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "Survey paper — no prompting used."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "Survey paper — no experiments with hyperparameters."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "Survey paper — no agentic scaffolding used."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper does not describe how papers were selected, what search queries were used, what databases were searched, or what inclusion/exclusion criteria were applied. There is no description of the paper collection methodology."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6 'Challenges' discusses limitations and open problems in the field, functioning as a limitations section."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "Section 6 discusses challenges of LLM-based agents as a field, but does not discuss threats to the validity of the survey itself — e.g., selection bias in paper collection, coverage gaps, or potential for missed work."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what is out of scope, what types of work were excluded, or what the temporal boundaries of the survey are (though Figure 1 implies coverage through August 2023)."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No list of all surveyed papers, search results, or categorization data is provided for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The paper does not describe how papers were collected — no search strategy, databases, queries, or time period are documented."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants; the data source is existing literature, so the standard not-applicable (N/A) treatment for this item applies."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No pipeline from paper discovery to inclusion/categorization is documented. The reader cannot know how the ~100 cited works were identified or whether important work was missed."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The Acknowledgement section lists funding: National Natural Science Foundation of China (No. 62102420), Beijing Outstanding Young Scientist Program, and several Renmin University initiatives."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors are listed as affiliated with Gaoling School of Artificial Intelligence, Renmin University of China."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Funders are government grants and university programs with no apparent stake in the survey's conclusions about LLM-based agents."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is included in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Survey paper — does not evaluate any pre-trained model on a benchmark."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Survey paper — does not evaluate any pre-trained model on a benchmark."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Survey paper — does not evaluate any pre-trained model on a benchmark."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this survey."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this survey."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this survey."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this survey."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this survey."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this survey."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this survey."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "Survey paper — no method with associated costs."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "Survey paper — no computational experiments."
    290       }
    291     },
    292     "survey_methodology": {
    293       "prisma_or_structured_protocol": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No PRISMA diagram, no structured search protocol, no reproducible search queries. The paper appears to collect papers ad-hoc without a documented systematic methodology."
    297       },
    298       "quality_assessment_of_sources": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The survey treats all papers equally regardless of methodological quality. No quality scoring rubric or risk-of-bias assessment is applied to included studies."
    302       },
    303       "publication_bias_discussed": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No discussion of publication bias, no acknowledgment that the surveyed literature skews toward positive results, no funnel plots or bias tests."
    307       }
    308     }
    309   },
    310   "claims": [
    311     {
    312       "claim": "LLM-based autonomous agents can be understood through a unified framework of four modules: profiling, memory, planning, and action.",
    313       "evidence": "Section 2.1 and Figure 2 present the unified framework with detailed analysis of each module, mapping ~30 existing systems to this taxonomy in Table 1.",
    314       "supported": "moderate"
    315     },
    316     {
    317       "claim": "Agent capability acquisition strategies can be divided into fine-tuning-based and non-fine-tuning-based approaches.",
    318       "evidence": "Section 2.2 categorizes strategies and provides examples for each, including fine-tuning with human, LLM-generated, and real-world datasets, plus prompt and mechanism engineering.",
    319       "supported": "moderate"
    320     },
    321     {
    322       "claim": "LLM-based agents have significant applications across social science, natural science, and engineering.",
    323       "evidence": "Section 3 and Table 2 catalogue applications across these domains with specific examples for each sub-area.",
    324       "supported": "moderate"
    325     },
    326     {
    327       "claim": "This survey compiles 100 relevant works on LLM-based agents covering construction, applications, and evaluation.",
    328       "evidence": "Stated in Section 5; the reference list contains ~185 entries with extensive coverage of the space.",
    329       "supported": "strong"
    330     }
    331   ],
    332   "red_flags": [
    333     {
    334       "flag": "No systematic review methodology",
    335       "detail": "The survey does not describe how papers were identified, selected, or categorized. No search strategy, databases, queries, inclusion/exclusion criteria, or PRISMA-style flow are provided. The reader cannot assess completeness or reproduce the paper collection."
    336     },
    337     {
    338       "flag": "No quality assessment of surveyed papers",
    339       "detail": "All surveyed papers are treated equally regardless of methodological rigor. Claims from papers with weak evidence are presented alongside well-supported findings without distinction, potentially laundering weak results."
    340     },
    341     {
    342       "flag": "Temporal coverage not explicitly bounded",
    343       "detail": "Figure 1 shows coverage through August 2023, but no explicit temporal boundary is stated. The paper does not discuss what may have been missed or how rapidly the field evolved after the coverage period."
    344     }
    345   ],
    346   "cited_papers": [
    347     {
    348       "title": "Generative agents: Interactive simulacra of human behavior",
    349       "authors": ["Joon Sung Park", "Joseph C. O'Brien", "Carrie J. Cai", "Meredith Ringel Morris", "Percy Liang", "Michael S. Bernstein"],
    350       "year": 2023,
    351       "relevance": "Foundational work on LLM-based agent simulation with memory, planning, and reflection modules."
    352     },
    353     {
    354       "title": "Communicative agents for software development",
    355       "authors": ["Chen Qian", "Xin Cong", "Cheng Yang"],
    356       "year": 2023,
    357       "arxiv_id": "2307.07924",
    358       "relevance": "ChatDev: multi-agent collaborative software development framework, key application of LLM agents in SE."
    359     },
    360     {
    361       "title": "MetaGPT: Meta programming for multi-agent collaborative framework",
    362       "authors": ["Sirui Hong", "Xiawu Zheng", "Jonathan Chen"],
    363       "year": 2023,
    364       "arxiv_id": "2308.00352",
    365       "relevance": "Multi-agent software development with role-based collaboration, relevant to agentic coding evaluation."
    366     },
    367     {
    368       "title": "Reflexion: Language agents with verbal reinforcement learning",
    369       "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath", "Karthik Narasimhan", "Shunyu Yao"],
    370       "year": 2023,
    371       "relevance": "Key agent architecture using verbal self-reflection for learning, influential in agent evaluation design."
    372     },
    373     {
    374       "title": "ReAct: Synergizing reasoning and acting in language models",
    375       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"],
    376       "year": 2023,
    377       "relevance": "Foundational reasoning+acting paradigm for LLM agents, widely used baseline in agent benchmarks."
    378     },
    379     {
    380       "title": "Voyager: An open-ended embodied agent with large language models",
    381       "authors": ["Guanzhi Wang", "Yuqi Xie", "Yunfan Jiang"],
    382       "year": 2023,
    383       "arxiv_id": "2305.16291",
    384       "relevance": "Open-ended agent with skill library and curriculum learning, key reference for agent capability acquisition."
    385     },
    386     {
    387       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    388       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    389       "year": 2022,
    390       "relevance": "Foundational prompting technique enabling agent planning capabilities."
    391     },
    392     {
    393       "title": "Tree of thoughts: Deliberate problem solving with large language models",
    394       "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"],
    395       "year": 2023,
    396       "relevance": "Multi-path reasoning framework for LLM planning, key component in agent architecture."
    397     },
    398     {
    399       "title": "AgentBench: Evaluating LLMs as agents",
    400       "authors": ["Xiao Liu", "Hao Yu", "Hanchen Zhang"],
    401       "year": 2023,
    402       "arxiv_id": "2308.03688",
    403       "relevance": "Comprehensive benchmark for evaluating LLM agent capabilities across diverse environments."
    404     },
    405     {
    406       "title": "Toolformer: Language models can teach themselves to use tools",
    407       "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì"],
    408       "year": 2023,
    409       "relevance": "Self-supervised tool learning for LLMs, foundational for tool-augmented agent design."
    410     },
    411     {
    412       "title": "HuggingGPT: Solving AI tasks with ChatGPT and its friends in Hugging Face",
    413       "authors": ["Yongliang Shen", "Kaitao Song", "Xu Tan"],
    414       "year": 2023,
    415       "relevance": "Task-planning agent leveraging external model ecosystem, key example of tool-use architecture."
    416     },
    417     {
    418       "title": "ToolLLM: Facilitating large language models to master 16000+ real-world APIs",
    419       "authors": ["Yujia Qin", "Shihao Liang", "Yining Ye"],
    420       "year": 2023,
    421       "arxiv_id": "2307.16789",
    422       "relevance": "Large-scale tool-use framework with fine-tuning and evaluation, relevant to agent capability benchmarking."
    423     }
    424   ]
    425 }

Impressum · Datenschutz