scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27580B)
      1 {
      2   "paper": {
      3     "title": "ChatDev: Communicative Agents for Software Development",
      4     "authors": [
      5       "Chen Qian",
      6       "Wei Liu",
      7       "Hongzhang Liu",
      8       "Nuo Chen",
      9       "Yufan Dang",
     10       "Jiahao Li",
     11       "Cheng Yang",
     12       "Weize Chen",
     13       "Yusheng Su",
     14       "Xin Cong",
     15       "Juyuan Xu",
     16       "Dahai Li",
     17       "Zhiyuan Liu",
     18       "Maosong Sun"
     19     ],
     20     "year": 2023,
     21     "venue": "ACL 2024",
     22     "arxiv_id": "2307.07924"
     23   },
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The abstract states 'The code and data are available at https://github.com/OpenBMB/ChatDev.' A working GitHub URL is provided."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The abstract states code and data are available at the GitHub repository. They also describe the SRDD dataset (1,200 software requirement prompts) and indicate it is released."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper mentions 'Python-3.11.4' and 'ChatGPT-3.5 with a temperature of 0.2' in the implementation details, but no requirements.txt, Dockerfile, or detailed environment setup with library versions is provided in the paper itself."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions are provided in the paper. The implementation details section describes the setup at a high level (5 subtasks, 3 phases, roles, termination conditions) but does not provide specific commands or a guide to replicate experiments."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Table 1 reports point estimates only (e.g., 0.5600 completeness, 0.8800 executability) with no confidence intervals or error bars. The dagger symbol indicates p<=0.05 significance tests but no uncertainty ranges are given."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Table 1 uses the dagger symbol (†) to indicate 'significant statistical differences (p<=0.05) between a baseline and ours.' Statistical significance testing was performed."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper reports raw metric values (e.g., Quality from 0.1523 to 0.3953), which provide context for magnitude, but no formal effect size measures (Cohen's d, odds ratios, etc.) are reported. The raw differences are stated but not contextualized as effect sizes."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The SRDD dataset has 1,200 prompts but the evaluation appears to use a subset (Table 2 mentions percentages suggesting a sample, and the paper does not justify the size). No power analysis or sample size justification is provided."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. All results in Tables 1-4 are single point estimates without any indication of variability across runs."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Two baselines are included: GPT-Engineer (single-agent) and MetaGPT (multi-agent). Both are described in the Baselines section and results are compared in Tables 1 and 2."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Both GPT-Engineer (Osika, 2023) and MetaGPT (Hong et al., 2023) were contemporary at the time of submission — all three systems are from 2023 and represent the state of the art for LLM-based software development at that time."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Table 4 presents an ablation study removing individual phases (halting after coding, complete, review, testing), removing communicative dehallucination (\\CDH), and removing roles (\\Roles). Each variant's impact on all four metrics is measured."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Four metrics are used: Completeness, Executability, Consistency, and Quality (the product of the first three). Additionally, pairwise comparison with both GPT-4 and human evaluators is reported in Table 2."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 2 reports pairwise evaluation by both GPT-4 and human evaluators. The paper states 'human experts independently assessed the task solutions, randomized to prevent order bias.' Human evaluation of system outputs is included."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "There is no mention of a train/dev/test split. The SRDD dataset is described as containing 1,200 prompts evaluated directly. It is unclear whether any portion was used for tuning vs. final evaluation, and there is no explicit held-out test set."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "The SRDD dataset has 5 main categories (Education, Work, Life, Game, Creation) and 40 subcategories, but no per-category performance breakdowns are provided. Only overall averages are reported in Tables 1-4."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 4.3 and Figures 4-5 provide detailed analysis of failure types: 'Method Not Implemented' (34.85% of review issues), 'ModuleNotFound' (45.76% of testing errors), and various other error categories. The Limitations section also discusses where agents fail (vague requirements, simple logic)."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The ablation study in Table 4 shows that removing testing (<=Testing) yields identical results to the full system on some metrics, suggesting testing has no measurable impact on completeness or consistency. The Limitations section candidly notes agents often implement 'simple logic' with 'low information density.'"
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The abstract claims ChatDev 'notably improves the quality of software, leading to improved completeness, executability, and better consistency with requirements.' Table 1 supports these claims with improvements over both baselines across all four metrics. The abstract also claims natural language aids system design and programming language aids debugging, which is supported by the communication analysis in Section 4.3."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper makes causal claims through ablation studies (Table 4): removing communicative dehallucination decreases all metrics, removing roles decreases quality substantially. These are controlled single-variable manipulations. The claim 'communicative agents guide each subtask towards integrated and automated solutions' is supported by the ablation design."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The title and framing ('Communicative Agents for Software Development') are broad, but the system was tested only with ChatGPT-3.5 on relatively simple software tasks (games, basic applications). The Limitations section acknowledges these 'are more suitable for prototype systems rather than complex real-world applications,' but the title and abstract do not bound claims to simple/prototype software."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper does not discuss alternative explanations for the observed improvements. For example, it does not consider whether the improvements over MetaGPT might be due to differences in prompt engineering rather than the communication paradigm, or whether the metrics themselves favor ChatDev's output style. The Limitations section discusses weaknesses but not alternative explanations for the positive results."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper states 'We used ChatGPT-3.5' with no specific version identifier (e.g., gpt-3.5-turbo-0613). 'ChatGPT-3.5' is a marketing name without a snapshot date or API version. GPT-4 is mentioned for pairwise evaluation but also without version specifics."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper describes the prompt structure at a high level (system prompts PI and PA covering overview, objectives, roles, protocols, constraints) but does not provide the actual prompt text used. Equations 1-7 formalize the communication pattern abstractly, but the concrete prompts given to agents are not included."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The Implementation Details section states: 'temperature of 0.2', termination after 'two unchanged code modifications or after 10 rounds of communication.' Key hyperparameters for the LLM API and the multi-agent system are reported."
    158       },
    159       "scaffolding_described": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The agentic scaffolding is described in detail across Section 3: chat chain workflow (Figure 2), role assignment (Agentization subsection), memory management (short-term and long-term memory with formal definitions), communicative dehallucination mechanism (Section 3.2), and termination conditions. The multi-agent architecture is the paper's main contribution."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "The SRDD dataset creation is described briefly: 'we utilize existing software descriptions as initial examples, which are then further developed through a process that combines LLM-based automatic generation with post-processing refinement guided by humans.' However, the specific filtering criteria, how many examples were generated vs. retained, and the human refinement process are not documented in detail."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 6 is titled 'Limitations' and provides substantial discussion across three main points: overestimation of agent capabilities, evaluation challenges, and computational cost."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The Limitations section makes specific points: agents 'often implement simple logic, resulting in low information density,' the evaluation metrics miss 'functionalities, robustness, safety, and user-friendliness,' and the system is described as 'more suitable for prototype systems rather than complex real-world applications.' These are specific to this study."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The Limitations section explicitly states: 'Currently, these technologies are more suitable for prototype systems rather than complex real-world applications.' It also specifies that evaluation covers completeness, executability, consistency, and quality, but 'future research should consider additional factors such as functionalities, robustness, safety, and user-friendliness.'"
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The paper states code and data are available at https://github.com/OpenBMB/ChatDev. The dataset (SRDD) and generated outputs are described as released, which would allow independent verification."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The Datasets section describes SRDD construction: initial examples from Li et al. (2023a), LLM-based automatic generation with human post-processing refinement, categories from popular platforms, 1,200 prompts across 5 areas and 40 subcategories with 30 prompts each."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "Human evaluators are mentioned for pairwise comparison (Table 2) and human post-processing refinement of the dataset, but no details are provided about who these humans were, how they were recruited, or their qualifications. The footnote only says 'human experts independently assessed the task solutions.'"
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "The dataset creation pipeline is described at a high level (existing descriptions -> LLM generation -> human refinement -> 1,200 prompts) but the specific filtering at each stage, how many examples were generated before filtering, and what the human refinement entailed are not documented."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The Acknowledgments section states: 'The work was supported by the National Key R&D Program of China (No.2022ZD0116312), the Postdoctoral Fellowship Program of CPSF under Grant Number GZB20230348, and Tencent Rhino-Bird Focused Research Program.'"
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are clearly listed: Tsinghua University, The University of Sydney, BUPT, and Modelbest Inc. One author (Dahai Li) is affiliated with Modelbest Inc., a commercial entity."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Funding comes from the Chinese National Key R&D Program, CPSF (postdoctoral fellowship), and Tencent Rhino-Bird (research program). None of these funders produce or sell the ChatDev system, and the system uses OpenAI's ChatGPT rather than a funder's product. The funders appear independent of the outcome."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "There is no competing interests or financial interests statement. One author is from Modelbest Inc. (a commercial entity), but no declaration of financial interests or conflicts is provided."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "The paper uses ChatGPT-3.5 but does not state the model's training data cutoff date. This is relevant because the evaluation uses custom software prompts that could potentially overlap with training data patterns."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of whether any of the 1,200 SRDD prompts or similar software requirements appeared in GPT-3.5's training data. The dataset was partly derived from 'existing software descriptions' which could be in the training set."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "The SRDD dataset is constructed from existing descriptions augmented with LLM generation. The paper does not discuss whether these existing descriptions were in GPT-3.5's training data. Since the dataset draws on 'popular platforms such as Ubuntu, Google Play, Microsoft Store, and Apple Store,' contamination risk is real and unaddressed."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "The paper does not involve human participants as subjects. Human evaluators are used for pairwise comparison but this is evaluation of system outputs, not a human subjects study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human subjects study — human evaluators assessed system outputs but were not subjects of the research."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human subjects study. The human evaluators are called 'human experts' but their demographics are not the focus since this is not a human subjects study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human subjects study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human subjects study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human subjects study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human subjects study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Table 3 reports Duration (148.2148 seconds for ChatDev) and #Tokens consumed (22,949.4450 per task). While direct API costs in dollars are not stated, the token consumption and wall-clock time provide meaningful cost proxies."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Table 3 reports per-task statistics but the total computational budget for the full evaluation (e.g., total tokens across all 1,200 tasks, total API spend, total wall-clock time for the full experiment) is not stated."
    295       }
    296     }
    297   },
    298   "claims": [
    299     {
    300       "claim": "ChatDev significantly outperforms both GPT-Engineer and MetaGPT across all four metrics (Completeness, Executability, Consistency, Quality).",
    301       "evidence": "Table 1: ChatDev achieves 0.5600 completeness, 0.8800 executability, 0.8021 consistency, 0.3953 quality vs. GPT-Engineer (0.5022, 0.3583, 0.7887, 0.1419) and MetaGPT (0.4834, 0.4145, 0.7601, 0.1523). Statistical significance indicated by dagger at p<=0.05.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "ChatDev is preferred over both baselines by human evaluators in pairwise comparison, with 90.16% win rate over GPT-Engineer and 88.00% win rate over MetaGPT.",
    306       "evidence": "Table 2 reports pairwise evaluation results with both GPT-4 and human evaluators. Human evaluators preferred ChatDev in 90.16% of comparisons against GPT-Engineer and 88.00% against MetaGPT.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "The communicative dehallucination mechanism reduces coding hallucinations and improves all metrics.",
    311       "evidence": "Table 4 ablation: removing communicative dehallucination (\\CDH) drops Quality from 0.3953 to 0.3094, Completeness from 0.5600 to 0.4700, and Executability from 0.8800 to 0.8400.",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "Role assignment is the most impactful component, with its removal causing the largest performance degradation.",
    316       "evidence": "Table 4: removing roles (\\Roles) drops Quality from 0.3953 to 0.2212, Executability from 0.8800 to 0.5800. Section 4.2 states 'the most substantial impact on performance occurs when the roles of all agents are removed.'",
    317       "supported": "moderate"
    318     },
    319     {
    320       "claim": "Natural language communication is advantageous for system design, while programming language communication drives software optimization.",
    321       "evidence": "Section 4.3 and Figure 3: design phase is 57.20% natural language, post-design phases show more programming language. The paper observes that agents autonomously discuss design aspects (target user, data management, UI) in natural language, while code review/testing in programming language drives bug fixes.",
    322       "supported": "weak"
    323     }
    324   ],
    325   "methodology_tags": [
    326     "benchmark-eval",
    327     "case-study"
    328   ],
    329   "key_findings": "ChatDev introduces a multi-agent software development framework using LLM-powered agents organized in a chat chain with communicative dehallucination to reduce coding hallucinations. On their custom SRDD dataset of 1,200 software prompts, ChatDev achieves 88% executability compared to GPT-Engineer's 36% and MetaGPT's 41%, with human evaluators preferring ChatDev's output in approximately 90% of pairwise comparisons. Ablation studies show that role assignment and communicative dehallucination are both critical components, with role removal causing the largest quality degradation (from 0.3953 to 0.2212).",
    330   "red_flags": [
    331     {
    332       "flag": "Custom non-standard benchmark",
    333       "detail": "The SRDD dataset was created by the authors using LLM generation + human refinement, and evaluation metrics (Completeness, Executability, Consistency as cosine similarity) are non-standard. No established benchmarks (e.g., HumanEval, SWE-bench) are used, making cross-study comparison impossible."
    334     },
    335     {
    336       "flag": "No variance or reproducibility information",
    337       "detail": "All results are single point estimates with no standard deviations, error bars, or indication of variability across runs. LLM outputs are stochastic even at temperature 0.2, so run-to-run variation is expected but unreported."
    338     },
    339     {
    340       "flag": "Consistency metric relies on embedding similarity",
    341       "detail": "The Consistency metric uses cosine similarity between semantic embeddings of requirements and generated code, which is a weak proxy for whether the software actually meets its requirements. A comment-heavy program might score high on embedding similarity without actually being functional."
    342     },
    343     {
    344       "flag": "Per-category results suppressed",
    345       "detail": "The dataset has 5 categories and 40 subcategories but only aggregate results are reported. Performance might vary dramatically across task types (e.g., games vs. data management), but this information is hidden by averaging."
    346     },
    347     {
    348       "flag": "Human evaluation details absent",
    349       "detail": "The pairwise human evaluation reports win rates but provides no details about the number of evaluators, their expertise, inter-rater agreement, or the evaluation protocol beyond 'human experts independently assessed the task solutions, randomized to prevent order bias.'"
    350     },
    351     {
    352       "flag": "Model version unspecified",
    353       "detail": "Using 'ChatGPT-3.5' without a specific version (e.g., gpt-3.5-turbo-0613) means results cannot be exactly reproduced, as the model was updated multiple times during 2023."
    354     }
    355   ],
    356   "cited_papers": [
    357     {
    358       "title": "MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework",
    359       "authors": ["Sirui Hong", "Mingchen Zhuge", "Jonathan Chen"],
    360       "year": 2023,
    361       "relevance": "Direct baseline for multi-agent software development comparison; key competitor in the LLM-based software engineering space."
    362     },
    363     {
    364       "title": "CAMEL: Communicative Agents for 'Mind' Exploration of Large Scale Language Model Society",
    365       "authors": ["Guohao Li", "Hasan Abed Al Kader Hammoud", "Hani Itani"],
    366       "year": 2023,
    367       "relevance": "Foundational multi-agent communication framework that inspired ChatDev's dual-agent communication design."
    368     },
    369     {
    370       "title": "AgentVerse: Facilitating Multi-agent Collaboration and Exploring Emergent Behaviors in Agents",
    371       "authors": ["Weize Chen", "Yusheng Su", "Jingwei Zuo"],
    372       "year": 2023,
    373       "relevance": "Multi-agent collaboration framework relevant to understanding agent coordination in software development."
    374     },
    375     {
    376       "title": "Generative Agents: Interactive Simulacra of Human Behavior",
    377       "authors": ["Joon Sung Park", "Joseph O'Brien", "Carrie Jun Cai"],
    378       "year": 2023,
    379       "relevance": "Pioneering work on LLM-based agents with memory and role-playing, foundational to agentic AI research."
    380     },
    381     {
    382       "title": "Voyager: An Open-ended Embodied Agent with Large Language Models",
    383       "authors": ["Guanzhi Wang", "Yuqi Xie", "Yunfan Jiang"],
    384       "year": 2023,
    385       "relevance": "Open-ended LLM agent with memory and skill library, relevant to understanding autonomous agent capabilities."
    386     },
    387     {
    388       "title": "Evaluating Large Language Models Trained on Code",
    389       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    390       "year": 2021,
    391       "arxiv_id": "2107.03374",
    392       "relevance": "Introduced HumanEval benchmark and Codex; foundational work on LLM code generation evaluation."
    393     },
    394     {
    395       "title": "ToolLLM: Facilitating Large Language Models to Master 16000+ Real-World APIs",
    396       "authors": ["Yujia Qin", "Shihao Liang", "Yining Ye"],
    397       "year": 2023,
    398       "arxiv_id": "2307.16789",
    399       "relevance": "Large-scale tool use by LLMs, relevant to understanding agentic AI capabilities."
    400     },
    401     {
    402       "title": "ChatEval: Towards Better LLM-based Evaluators through Multi-Agent Debate",
    403       "authors": ["Chi-Min Chan", "Weize Chen", "Yusheng Su"],
    404       "year": 2023,
    405       "relevance": "Multi-agent LLM evaluation framework, relevant to LLM-as-judge methodology."
    406     },
    407     {
    408       "title": "Cognitive Architectures for Language Agents",
    409       "authors": ["Theodore R. Sumers", "Shunyu Yao", "Karthik Narasimhan"],
    410       "year": 2023,
    411       "arxiv_id": "2309.02427",
    412       "relevance": "Theoretical framework for LLM agent architectures including memory and planning, directly relevant to agentic AI design."
    413     },
    414     {
    415       "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis",
    416       "authors": ["Erik Nijkamp", "Bo Pang", "Hiroaki Hayashi"],
    417       "year": 2023,
    418       "relevance": "Open-source code generation model with multi-turn synthesis, relevant to code generation evaluation."
    419     },
    420     {
    421       "title": "Exchange-of-Thought: Enhancing Large Language Model Capabilities through Cross-Model Communication",
    422       "authors": ["Zhangyue Yin", "Qiushi Sun", "Cheng Chang"],
    423       "year": 2023,
    424       "relevance": "Cross-model communication to enhance LLM capabilities, relevant to multi-agent collaboration mechanisms."
    425     },
    426     {
    427       "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents",
    428       "authors": ["Shuyan Zhou", "Frank F Xu", "Hao Zhu"],
    429       "year": 2023,
    430       "arxiv_id": "2307.13854",
    431       "relevance": "Benchmark environment for autonomous web agents, relevant to agent evaluation methodology."
    432     }
    433   ]
    434 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs