scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26679B)
      1 {
      2   "paper": {
      3     "title": "CAMEL: Communicative Agents for \"Mind\" Exploration of Large Language Model Society",
      4     "authors": ["Guohao Li", "Hasan Abed Al Kader Hammoud", "Hani Itani", "Dmitrii Khizbullin", "Bernard Ghanem"],
      5     "year": 2023,
      6     "venue": "NeurIPS 2023",
      7     "arxiv_id": "2303.17760"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper provides a GitHub link (https://github.com/camel-ai/camel) in the abstract and states 'we have open-sourced our library, containing implementations of various agents, data generation pipelines, data analysis tools, and collected datasets.' Datasets are also on HuggingFace (https://huggingface.co/camel-ai)."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper states 'All the generated datasets are made available on HuggingFace: https://huggingface.co/camel-ai' (Appendix J). The AI Society, Code, Math, and Science datasets are publicly released."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions using gpt-3.5-turbo, GPT4, and LLaMA-7B, and lists training hardware (4xA100-80GB GPUs) and CPU specs in Appendix K.3, but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While prompt templates are provided and the framework is described, the paper does not include step-by-step reproduction instructions, specific commands to run, or a 'Reproducing Results' section. The reader would need to consult the GitHub repository separately."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The evaluation results in Table 1 report only point estimates (e.g., CAMEL Agents Win: 76.3% in human evaluation, 73.0% in GPT4 evaluation). No confidence intervals or error bars are provided for any results."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims CAMEL 'outperforms gpt-3.5-turbo single-shot solution in both the human evaluation and the GPT4 evaluation by a big margin' (Section 5.1) but provides no statistical significance tests to support this claim."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Table 1 provides win/loss/draw percentages with enough context for the reader to assess the magnitude: CAMEL Agents Win 76.3% vs gpt-3.5-turbo Wins 10.4% in human evaluation, and 73.0% vs 23.0% in GPT4 evaluation. Table 3 reports pass@k scores for HumanEval benchmarks with specific values."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper evaluates on 100 randomly selected tasks from AI Society and 100 from Code (Section 5.1), and 453 human responses were collected, but no justification is given for why these sample sizes are adequate. The LLaMA evaluation uses only 20 tasks per domain (except 60 for science) with no justification."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or spread measures are reported for any experimental results. The human evaluation, GPT4 evaluation, and HumanEval results are all single-run point estimates."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares CAMEL agent solutions against gpt-3.5-turbo single-shot solutions (Table 1) and compares CAMEL-7B against LLaMA-7B and Vicuna-7B on HumanEval/HumanEval+ (Table 3)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The baselines (gpt-3.5-turbo, LLaMA-7B, Vicuna-7B) were all contemporary models at the time of publication in 2023. However, the comparison is only against single-shot gpt-3.5-turbo, not against other multi-agent or chain-of-thought approaches."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Appendix J presents ablation studies on prompt design: Prompt V2 (removing assistant output format) and Prompt V1 + Task Planner. The ablations analyze effects on conversation termination distribution and flake messages (Figures 9 and 10)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper uses both human evaluation and GPT4 evaluation for the agent comparison (Table 1), and reports pass@1 and pass@100 on both HumanEval and HumanEval+ for code evaluation (Table 3). The progressive fine-tuning evaluation uses win/draw/loss across multiple domains."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Section 5.1 describes a human evaluation: 'we present both the CAMEL summarized agent solution and the gpt-3.5-turbo single-shot solution side-by-side to human participants... A total of 453 responses were collected.'"
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "For the main agent evaluation, 100 tasks were 'randomly selected' from the generated datasets but there is no explicit train/test split or held-out set methodology described. HumanEval is a standard external benchmark. For the progressive fine-tuning evaluation (Table 2), the test questions were generated separately, but the methodology for ensuring no overlap with the fine-tuning data is not discussed."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 2 provides per-domain breakdowns (AI Society, Code, Math, Science) for the progressive fine-tuning experiments. The agent evaluation is broken down by dataset (AI Society vs Code) and evaluation type (Human vs GPT4) in Table 1."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4.1 and Appendix G extensively discuss four types of failures: role flipping, assistant repeating instructions, flake replies, and infinite loop of messages. Concrete examples are provided in Appendix G (Figure 7)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports negative findings including challenges where agents fail (role flipping, infinite loops). The ablation in Appendix J shows that Prompt V2 and Task Planner increase flake messages despite reducing other issues (Figure 10). The 'Bad Mind' example (Appendix B) demonstrates misalignment risks."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims about autonomous cooperation, role-playing framework, conversational data generation, and open-sourcing are supported. The claim that 'solutions derived from our role-playing framework outperform those generated in a single shot by gpt-3.5-turbo' is supported by Table 1 results, though without statistical significance testing."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims such as 'utilizing a multi-agent cooperative approach is more effective than gpt-3.5-turbo's single shot solution' (Table 1 caption) and 'These findings underscore the critical role played by the generated datasets in enhancing LLaMA's ability' (Section 5.3). However, there is no controlled comparison isolating the multi-agent mechanism from other confounds (e.g., the CAMEL solution was summarized by GPT-4, introducing a confound)."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title and abstract use broad language like 'Mind Exploration of Large Language Model Society' and 'scalable techniques to facilitate autonomous cooperation among communicative agents.' The framework is only tested with gpt-3.5-turbo agents on specific task types, but the claims are framed broadly. The paper states 'we focus on studying communicative agents under cooperative settings' (Section 3) which provides some bounding, but the title and framing exceed the tested scope."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for why CAMEL outperforms single-shot solutions. A key confound is that CAMEL solutions are summarized by GPT-4 before evaluation, meaning GPT-4 summarization quality could explain the improvement rather than multi-agent cooperation itself. This confound is not discussed."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper uses 'gpt-3.5-turbo' and 'GPT4' without specifying exact model versions or snapshot dates (e.g., no 'gpt-3.5-turbo-0301' or 'gpt-4-0314'). LLaMA-7B is specified by size but not by a specific checkpoint version."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The paper provides full prompt templates with placeholders in Figures 2, 3, 4, and 5, as well as the actual evaluation prompt (Appendix H), solution extraction prompt (Appendix H), and embodied agent prompt (Figure 15). The fill values for placeholders are also described (e.g., 50 roles listed in Appendix E). The prompts are detailed enough to reconstruct the system."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Table 5 in Appendix K.2 provides detailed training hyperparameters for LLaMA fine-tuning: learning rate 2e-5, epochs 3, batch size 4, etc. However, the temperature/sampling settings for gpt-3.5-turbo and GPT-4 API calls are not reported."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The role-playing framework is described in detail in Section 3: inception prompting, role assignment, conversation flow with formal notation (Equations 1-4), termination conditions (Section 4.1), and the critic-in-the-loop extension (Appendix O). The full workflow is depicted in Figure 1."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The data generation pipeline is documented: role generation (50 assistant roles, 50 user roles), task generation (10 tasks per role pair = 25,000 conversations), task specification, and conversation generation. The prompts used at each stage are provided (Figures 3, 5). For Math and Science, the generation process is detailed in Appendix F with Table 4."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Appendix K.1 ('Broader Impacts and Limitations') provides a dedicated discussion of risks, limitations, and future work, including potential misuse, evaluation challenges, and cost limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The limitations section discusses specific threats: 'existing LLMs are not fully tuned to be harmless, they can be easily exploited by malicious users'; 'evaluating its task completion capabilities poses a challenge that necessitates the involvement of numerous domain experts'; 'Human evaluators may have a preference for longer answers'; 'Large language models used in our framework may produce false information.'"
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "While the paper notes it only tested with two-agent settings and mentions extending to more agents as future work, it does not clearly state what the results do NOT show. There is no explicit bounding of which tasks, models, or domains the claims hold for versus not."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The generated datasets (AI Society, Code, Math, Science) are available on HuggingFace (https://huggingface.co/camel-ai), allowing independent verification of the conversational data."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The data collection procedure is described in detail: LLM-generated roles, tasks, task specification, and multi-turn conversations. The exact prompts, number of roles (50+50), tasks per pair (10), and total conversations (25,000 for AI Society) are specified. Math and Science generation is documented in Appendix F."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "For the human evaluation (453 responses), the paper does not describe how human participants were recruited, what channels were used, or what background they had. The evaluation guideline is provided (Appendix K.5) but recruitment methods are absent."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from idea generation through role generation, task generation, task specification, conversation generation, and termination is documented with prompts and counts at each stage. Termination reason distributions are analyzed in Appendix J (Figure 8)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 7 (Acknowledgements) states: 'This work was supported by SDAIA-KAUST Center of Excellence in Data Science and Artificial Intelligence (SDAIA-KAUST AI).'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors are listed as affiliated with King Abdullah University of Science and Technology (KAUST). The paper evaluates OpenAI models but the authors are not affiliated with OpenAI."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The funder (SDAIA-KAUST AI) is a research center with no apparent financial stake in the specific results about multi-agent cooperation or OpenAI model performance."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement or financial interests declaration is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper evaluates CAMEL-7B (fine-tuned LLaMA) on HumanEval and HumanEval+ benchmarks but does not state the training data cutoff for LLaMA-7B or gpt-3.5-turbo."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "The paper fine-tunes LLaMA on CAMEL-generated data and evaluates on HumanEval, but does not discuss whether HumanEval problems could have been in LLaMA's pretraining data or in the CAMEL-generated code data."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "HumanEval was published in 2021 and LLaMA was trained on data that likely includes it. The paper does not address this contamination risk when reporting HumanEval results in Table 3."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The human evaluation study is not pre-registered. No link to any pre-registration is provided."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No IRB or ethics board approval is mentioned for the human evaluation study, in which 453 responses were collected."
    243       },
    244       "demographics_reported": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No demographics of the human evaluators who provided the 453 responses are reported. There is no information about their background, expertise, or any other characteristics."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No inclusion or exclusion criteria for human evaluators are stated. The paper does not describe who was eligible to participate."
    253       },
    254       "randomization_described": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "The paper states solutions were presented 'side-by-side' and 'the identity behind each solution is not revealed,' but does not describe whether the left/right ordering was randomized across tasks or participants."
    258       },
    259       "blinding_described": {
    260         "applies": true,
    261         "answer": true,
    262         "justification": "Section 5.1 states: 'we present both the CAMEL summarized agent solution and the gpt-3.5-turbo single-shot solution side-by-side to human participants. The identity behind each solution is not revealed.' This describes single-blinding of evaluators."
    263       },
    264       "attrition_reported": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "No information is provided about how many evaluators started versus finished, or whether any responses were excluded."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The paper mentions cost concerns ('the cost of generated chats', 'the cost grows quadratically with the length of the conversation') but does not report actual API costs, tokens consumed, or total data generation costs."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Appendix K.3 states: 'For training the models we used 4xA100-80GB GPUs. For generating the data we used devices equipped with Intel(R) Xeon(R) Gold 6242 CPU @ 2.80GHz.' However, total GPU hours and API spend are not quantified."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CAMEL agent solutions outperform gpt-3.5-turbo single-shot solutions in both human and GPT4 evaluations.",
    286       "evidence": "Table 1: Human evaluation shows CAMEL wins 76.3% vs gpt-3.5-turbo wins 10.4% (AI Society). GPT4 evaluation shows CAMEL wins 73.0% vs gpt-3.5-turbo wins 23.0% (AI Society) and 76.0% vs 24.0% (Code).",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "Progressive fine-tuning on CAMEL datasets demonstrates emergence of knowledge in LLaMA-7B across domains.",
    291       "evidence": "Table 2 shows that fine-tuning on progressively more datasets (AI Society, Code, Math, Science) improves performance on the newly added domain as measured by GPT4 preference evaluation on 20-60 test questions per domain.",
    292       "supported": "weak"
    293     },
    294     {
    295       "claim": "CAMEL-7B (LLaMA fine-tuned on all CAMEL datasets) outperforms LLaMA-7B and Vicuna-7B on HumanEval.",
    296       "evidence": "Table 3: CAMEL-7B achieves 14.0% pass@1 and 57.9% pass@100 on HumanEval, compared to LLaMA-7B's 10.5%/36.5% and Vicuna-7B's 11.0%/42.9%.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Inception prompting addresses challenges of autonomous agent cooperation including role flipping, flake replies, and infinite loops.",
    301       "evidence": "Section 4.1 identifies four challenges (role flipping, assistant repeating instructions, flake replies, and infinite loops of messages). The termination conditions and prompt design are shown to mitigate them. Ablation in Appendix J (Figures 9-10) shows tradeoffs between different prompt designs, though the original Prompt V1 has the lowest flake rate.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Human and GPT4 evaluations are 'highly aligned'.",
    306       "evidence": "Table 1 shows similar trends: both prefer CAMEL (76.3% human vs 73.0% GPT4 for AI Society), but no statistical test of alignment/correlation is provided.",
    307       "supported": "weak"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval", "case-study"],
    311   "key_findings": "CAMEL introduces a role-playing framework for autonomous multi-agent cooperation using inception prompting, where an AI user and AI assistant collaborate on tasks with minimal human intervention. The framework generates large conversational datasets (AI Society: 25K conversations, Code, Math, Science) that are released publicly. Multi-agent solutions outperform single-shot gpt-3.5-turbo solutions in both human and GPT4 evaluations on AI Society (76.3% and 73.0% win rates, respectively). Progressive fine-tuning of LLaMA-7B on CAMEL datasets shows knowledge transfer across domains, with CAMEL-7B achieving 14.0% pass@1 on HumanEval versus 10.5% for base LLaMA-7B.",
    312   "red_flags": [
    313     {
    314       "flag": "GPT-4 summarization confound",
    315       "detail": "CAMEL multi-turn solutions are summarized by GPT-4 before comparison with gpt-3.5-turbo single-shot solutions. The improvement could be partly attributed to GPT-4 summarization quality rather than multi-agent cooperation. This confound is not discussed."
    316     },
    317     {
    318       "flag": "No statistical significance testing",
    319       "detail": "All evaluation results are reported as point estimates without confidence intervals, error bars, or statistical tests. Claims of one method outperforming another are based on raw percentage comparisons."
    320     },
    321     {
    322       "flag": "LLM-as-judge evaluation concerns",
    323       "detail": "A large portion of the evaluation relies on GPT-4 as a judge, which introduces potential biases (e.g., preference for verbosity, self-preference). The alignment between human and GPT4 evaluation is claimed but not statistically tested."
    324     },
    325     {
    326       "flag": "Very small evaluation sample for knowledge emergence",
    327       "detail": "The progressive fine-tuning knowledge emergence claim (Table 2) is based on only 20 test questions per domain (60 for science), evaluated by GPT-4. This is too small for reliable conclusions about emergent knowledge."
    328     },
    329     {
    330       "flag": "Missing human evaluator recruitment details",
    331       "detail": "453 human evaluation responses are reported but no information is given about who the evaluators were, how they were recruited, their expertise, or whether any quality controls were applied."
    332     },
    333     {
    334       "flag": "Benchmark contamination unaddressed",
    335       "detail": "CAMEL-7B is evaluated on HumanEval (published 2021) without discussing whether these problems appeared in LLaMA's pretraining data or in the CAMEL-generated Code dataset."
    336     }
    337   ],
    338   "cited_papers": [
    339     {
    340       "title": "Evaluating large language models trained on code",
    341       "authors": ["Mark Chen"],
    342       "year": 2021,
    343       "arxiv_id": "2107.03374",
    344       "relevance": "HumanEval benchmark used to evaluate CAMEL-7B code generation capabilities."
    345     },
    346     {
    347       "title": "Is your code generated by chatgpt really correct? Rigorous evaluation of large language models for code generation",
    348       "authors": ["Jiawei Liu"],
    349       "year": 2023,
    350       "arxiv_id": "2305.01210",
    351       "relevance": "HumanEval+ benchmark used for more rigorous code generation evaluation."
    352     },
    353     {
    354       "title": "Training language models to follow instructions with human feedback",
    355       "authors": ["Long Ouyang"],
    356       "year": 2022,
    357       "relevance": "InstructGPT/RLHF methodology underlying the instruction-following LLMs used in CAMEL."
    358     },
    359     {
    360       "title": "LLaMA: Open and efficient foundation language models",
    361       "authors": ["Hugo Touvron"],
    362       "year": 2023,
    363       "arxiv_id": "2302.13971",
    364       "relevance": "Base model fine-tuned on CAMEL datasets for knowledge emergence experiments."
    365     },
    366     {
    367       "title": "Vicuna: An open-source chatbot impressing gpt-4 with 90% chatgpt quality",
    368       "authors": ["Wei-Lin Chiang"],
    369       "year": 2023,
    370       "relevance": "Baseline model compared against CAMEL-7B on HumanEval benchmarks."
    371     },
    372     {
    373       "title": "Self-instruct: Aligning language model with self generated instructions",
    374       "authors": ["Yizhong Wang"],
    375       "year": 2022,
    376       "arxiv_id": "2212.10560",
    377       "relevance": "Prior work on automated instruction generation that CAMEL's approach builds upon."
    378     },
    379     {
    380       "title": "Toolformer: Language models can teach themselves to use tools",
    381       "authors": ["Timo Schick"],
    382       "year": 2023,
    383       "arxiv_id": "2302.04761",
    384       "relevance": "Related work on LLM tool use that informs CAMEL's embodied agent extension."
    385     },
    386     {
    387       "title": "ReAct: Synergizing reasoning and acting in language models",
    388       "authors": ["Shunyu Yao"],
    389       "year": 2023,
    390       "relevance": "Prompting method for LLM reasoning and decision-making, related to CAMEL's inception prompting approach."
    391     },
    392     {
    393       "title": "Reflexion: an autonomous agent with dynamic memory and self-reflection",
    394       "authors": ["Noah Shinn"],
    395       "year": 2023,
    396       "arxiv_id": "2303.11366",
    397       "relevance": "Autonomous agent framework with self-reflection capabilities, related to CAMEL's multi-agent approach."
    398     },
    399     {
    400       "title": "Constitutional AI: Harmlessness from AI feedback",
    401       "authors": ["Yuntao Bai"],
    402       "year": 2022,
    403       "arxiv_id": "2212.08073",
    404       "relevance": "AI alignment methodology relevant to CAMEL's discussion of agent alignment and the misalignment dataset."
    405     },
    406     {
    407       "title": "Cooperative AI: machines must learn to find common ground",
    408       "authors": ["Allan Dafoe"],
    409       "year": 2021,
    410       "relevance": "Foundational work on cooperative AI that motivates CAMEL's multi-agent cooperation framework."
    411     },
    412     {
    413       "title": "WebArena: A realistic web environment for building autonomous agents",
    414       "authors": ["Shuyan Zhou"],
    415       "year": 2023,
    416       "arxiv_id": "2307.13854",
    417       "relevance": "Web-based agent evaluation environment relevant to evaluating agentic AI systems."
    418     }
    419   ]
    420 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs