scan.json (29922B)
1 { 2 "paper": { 3 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 4 "authors": [ 5 "Qingyun Wu", 6 "Gagan Bansal", 7 "Jieyu Zhang", 8 "Yiran Wu", 9 "Beibin Li", 10 "Erkang Zhu", 11 "Li Jiang", 12 "Xiaoyun Zhang", 13 "Shaokun Zhang", 14 "Jiale Liu", 15 "Ahmed Awadallah", 16 "Ryen W. White", 17 "Doug Burger", 18 "Chi Wang" 19 ], 20 "year": 2023, 21 "venue": "arXiv", 22 "arxiv_id": "2308.08155" 23 }, 24 "checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "The paper provides a GitHub link: https://github.com/microsoft/autogen in the abstract (footnote 2). This is a working, public repository." 30 }, 31 "data_released": { 32 "applies": true, 33 "answer": true, 34 "justification": "The evaluations use publicly available benchmarks: MATH dataset (Hendrycks et al., 2021), Natural Questions (Kwiatkowski et al., 2019), ALFWorld (Shridhar et al., 2021), and MiniWoB++ (Shi et al., 2017). No proprietary datasets were created that would need separate release." 35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided. The paper mentions pre-installing 'sympy' for math experiments but does not provide comprehensive dependency or environment specifications." 40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "While code is released and the paper describes workflows conceptually, there are no step-by-step reproduction instructions, scripts to replicate specific experiments, or a 'Reproducing Results' section. The paper describes configurations at a high level but does not provide specific commands or scripts to reproduce the benchmark results." 45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": false, 51 "justification": "Main results in Figures 4a-4d report only point estimates (e.g., '52.5%', '69.48%') with no confidence intervals or error bars. The OptiGuide Table 4 reports standard deviations for user interaction savings, but the primary benchmark results lack uncertainty quantification." 52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper claims AutoGen outperforms baselines (e.g., 'the highest problem-solving success rate among all compared methods') but provides no statistical significance tests. All comparisons are based on raw number differences." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Effect sizes are reported with baseline context: e.g., '15% performance gain on average' for grounding agent (Section A3), 'multi-agent design boosts the F-1 score by 8% (with GPT-4) and 35% (with GPT-3.5-turbo)' (Section A4), and '3x saving on user's time' (Section A4). Results include specific percentages with baselines visible in figures." 62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": false, 66 "justification": "No justification is given for sample sizes. The MATH evaluation uses '120 randomly selected level-5 problems' without justification for this number. The qualitative evaluation uses only 2 problems tested 3 times each. The dynamic group chat pilot study uses 12 manually crafted tasks. The OptiGuide user study involves only a single expert participant. None of these sample sizes are justified." 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": false, 71 "justification": "The ALFWorld results report 'average' and 'best of 3' across 3 attempts, but no standard deviation or variance is reported for the main benchmark results (MATH, Natural Questions, ALFWorld, MiniWoB++). Table 4 reports standard deviations for the OptiGuide saving ratio, but this is a secondary metric. The primary success rates lack variance reporting." 72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": true, 78 "justification": "Multiple baselines are included across applications: vanilla GPT-4, ChatGPT+Code Interpreter, ChatGPT+Plugin, LangChain ReAct, Multi-Agent Debate for A1; DPR for A2; ReAct for A3; single-agent vs multi-agent for A4; RCI for A7 (MiniWoB++)." 79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": true, 83 "justification": "Baselines are contemporary for 2023: ChatGPT+Code Interpreter, GPT-4 (released 2023), Multi-Agent Debate (Liang et al., 2023), MetaGPT (Hong et al., 2023), RCI (Kim et al., 2023). These represent the state of the art at the time of writing." 84 }, 85 "ablation_study": { 86 "applies": true, 87 "answer": true, 88 "justification": "Several ablation studies are conducted: interactive retrieval vs. non-interactive in A2 (Figure 4b); two-agent vs. three-agent (with grounding agent) in A3 (Table 3); single-agent vs. multi-agent in A4 (Figure 4d); with vs. without board agent in A6; role-play vs. task-based speaker selection in A5 (Tables 5, 6)." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "Multiple metrics are used: success ratio for MATH (A1); F1 and Recall for Q&A (A2, Figure 4b); success ratio per category for ALFWorld (A3, Table 3); F1 and Recall for OptiGuide (A4, Figure 4d); number of successes, LLM calls, and termination failures for dynamic group chat (A5, Tables 5-6)." 94 }, 95 "human_evaluation": { 96 "applies": true, 97 "answer": true, 98 "justification": "A user study comparing ChatGPT+Code Interpreter vs. AutoGen-based OptiGuide is conducted in A4 with an expert Python programmer, measuring time and accuracy on 10 questions. The qualitative evaluations in A1 also involve manual assessment of failure reasons." 99 }, 100 "held_out_test_set": { 101 "applies": true, 102 "answer": true, 103 "justification": "The MATH evaluation uses the standard test dataset (5000 problems). ALFWorld uses '134 unseen tasks.' MiniWoB++ uses all available tasks from the official RCI code. These are standard held-out evaluation sets from published benchmarks." 104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": true, 108 "justification": "ALFWorld results (Table 3) provide per-category breakdown across Pick, Clean, Heat, Cool, Look, and Pick 2 task types. MiniWoB++ (Figure 18) shows per-task success rates for all 49 tasks." 109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": true, 113 "justification": "Failure cases are extensively discussed: Table 2 provides qualitative failure analysis for math problems; Figure 10 shows a failure trajectory in ALFWorld; Table 7 provides case analysis of typical failure tasks in MiniWoB++; the paper discusses loops and common error patterns." 114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": true, 118 "justification": "Negative results are reported: the two-agent system for ALFWorld does not outperform ReAct (both at 54% average); MiniWobChat achieves 52.8% which is 3.6% lower than RCI; preliminary evaluations show BabyAGI, CAMEL, and MetaGPT are 'not suitable choices for solving math problems out of the box.'" 119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": true, 125 "justification": "The abstract claims AutoGen agents are 'customizable, conversable' (demonstrated through framework design in Section 2), can 'operate in various modes' (shown through six applications in Section 3), and 'empirical studies demonstrate the effectiveness' (supported by benchmark results in Figures 4a-4d). Claims are appropriately hedged." 126 }, 127 "causal_claims_justified": { 128 "applies": true, 129 "answer": true, 130 "justification": "Causal claims are primarily made through ablation studies which constitute controlled single-variable manipulations: adding/removing grounding agent (A3), interactive vs. non-interactive retrieval (A2), single vs. multi-agent (A4), with vs. without board agent (A6). These are adequate designs for the causal claims being made." 131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": false, 135 "justification": "The title and abstract claim AutoGen is a framework for 'diverse applications of various complexities and LLM capacities,' but empirical results are limited to specific benchmarks (MATH, Natural Questions, ALFWorld, MiniWoB++, OptiGuide, 12 manually crafted tasks, and chess). The paper does not explicitly bound its generalization claims to these tested domains. The framework claims are broader than what is empirically tested." 136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": false, 140 "justification": "The paper does not discuss alternative explanations for observed performance gains. For example, the improvement from the grounding agent in ALFWorld could be due to additional prompt information rather than the multi-agent architecture. The OptiGuide user study with a single expert does not discuss alternative explanations for time savings. No threats-to-validity section exists." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper uses 'GPT-4' and 'GPT-3.5-turbo' throughout without specifying snapshot dates or API versions (e.g., 'gpt-4-0613'). The only specific version mentioned is 'text-davinci-003' for ReAct (from the official code, not the authors' choice). No model version identifiers are provided for the AutoGen experiments." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": true, 152 "justification": "The default system message for the AssistantAgent is provided in full in Figure 5 (Appendix C). The retrieval-augmented chat prompt is shown in Figure 8. The ALFWorld few-shot prompts are referenced as obtained from the ReAct repository. While not every prompt is shown, the primary system prompts are provided in full." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": false, 157 "justification": "No temperature, top-p, max tokens, or other LLM inference hyperparameters are reported anywhere in the paper. The paper mentions configuring agents but does not specify the inference settings used for experiments." 158 }, 159 "scaffolding_described": { 160 "applies": true, 161 "answer": true, 162 "justification": "The multi-agent scaffolding is described in detail: Section 2 explains the conversable agent design, auto-reply mechanisms, conversation programming paradigm, and control flow. Each application section describes the specific agent topology, roles, and interaction patterns (e.g., Commander/Writer/Safeguard in A4, grounding agent in A3)." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Data preprocessing is documented for key experiments: MATH uses '120 randomly selected level-5 problems' from 6 categories excluding geometry; Natural Questions uses '5,332 non-redundant context documents and 6,775 queries from HuggingFace'; ALFWorld uses '134 unseen tasks'; OptiGuide uses '100 coding tasks crafted to include equal numbers of safe and unsafe tasks.'" 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": false, 174 "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. The Discussion (Section 4) mentions future directions and the Ethics Statement discusses potential concerns, but neither constitutes a substantive limitations discussion addressing the empirical evaluations." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": false, 179 "justification": "No specific threats to validity are discussed. The paper does not address potential confounds in its evaluations, small sample sizes in qualitative studies, or the single-participant user study. The Ethics Statement discusses general ethical considerations but not study-specific validity threats." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": false, 184 "justification": "The paper states 'this work is still in its early experimental stages' (Section 4) but does not explicitly state what the results do NOT show. It does not bound claims to specific domains, model types, or task complexities tested. The Future Work section (Appendix B.2) raises open questions but does not state specific scope limitations of the current results." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": false, 191 "justification": "Raw experimental outputs (model responses, conversation logs, intermediate results) are not available for independent verification. Only selected example outputs are shown in Appendix E. There is no data repository or supplementary data files." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Data collection procedures are described: MATH uses a random selection from the standard dataset; Natural Questions uses the HuggingFace version; ALFWorld uses the standard 134 unseen tasks; OptiGuide uses a crafted dataset of 100 tasks. The A5 pilot study uses '12 manually crafted complex tasks' with examples provided." 197 }, 198 "recruitment_methods_described": { 199 "applies": true, 200 "answer": false, 201 "justification": "The OptiGuide user study (A4) mentions 'an expert Python programmer with proficiency in Gurobi participated in the test' but provides no recruitment details — how they were selected, their relationship to the project, or potential selection bias. This is relevant since the single participant's expertise could bias the results." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": false, 206 "justification": "The data pipeline from raw experimental runs to reported results is not fully documented. For example, the MATH evaluation does not explain how the 120 level-5 problems were randomly selected, what random seed was used, or whether any runs were excluded. The MiniWoB++ evaluation references 'all available tasks in the official RCI code' without detailing which tasks those are." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "The Acknowledgements section states: 'Qingyun Wu would like to acknowledge the funding and research support from the College of Information Science and Technology at Penn State University.' Microsoft Research affiliation is listed for the majority of authors." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are clearly listed: Microsoft Research, Pennsylvania State University, University of Washington, and Xidian University. The Microsoft affiliation is prominent given that AutoGen is a Microsoft Research project." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": false, 223 "justification": "The majority of authors are from Microsoft Research, and AutoGen is a Microsoft Research project (hosted at github.com/microsoft/autogen). Microsoft has a direct financial and reputational interest in the success of AutoGen. The funder (Microsoft) is not independent of the outcome." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests statement is provided. There is no declaration of financial interests, patents, or equity related to the work. Given that AutoGen is a Microsoft product, this disclosure would be relevant." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": true, 234 "answer": false, 235 "justification": "The paper uses GPT-4 and GPT-3.5-turbo on benchmarks like MATH (published 2021) and Natural Questions (published 2019) but does not state the training data cutoff date for any model used. The only hint is mentioning that FLAML Spark APIs added in December 2022 are 'not encompassed in the GPT-4 training data,' but no explicit cutoff date is stated." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": true, 239 "answer": false, 240 "justification": "No discussion of potential train/test overlap. MATH (2021), Natural Questions (2019), and ALFWorld (2021) are all public benchmarks that could have been in GPT-4's training data. This is not addressed." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": true, 244 "answer": false, 245 "justification": "The benchmarks used (MATH, Natural Questions, ALFWorld, MiniWoB++) were all published before GPT-4's likely training cutoff. The paper does not discuss contamination risk despite this being a known issue for these benchmarks." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": true, 251 "answer": false, 252 "justification": "The OptiGuide user study (A4) involving a human participant is not pre-registered. No pre-registration link is provided." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": true, 256 "answer": false, 257 "justification": "The OptiGuide user study involving a human participant does not mention IRB or ethics board approval." 258 }, 259 "demographics_reported": { 260 "applies": true, 261 "answer": false, 262 "justification": "The single user study participant is described only as 'an expert Python programmer with proficiency in Gurobi.' No further demographics (experience level, years of experience, relationship to the project, etc.) are reported." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": true, 266 "answer": false, 267 "justification": "No inclusion or exclusion criteria for the user study participant are stated. The participant was simply described as 'an expert Python programmer with proficiency in Gurobi.'" 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "The user study is not an experimental study with multiple conditions assigned to different participants; the same participant used both systems. Randomization of task order is not described but this is more of an observational comparison than an RCT." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "Blinding is not feasible in this user study since the participant interacts directly with both systems (ChatGPT+Code Interpreter vs. OptiGuide) and would necessarily know which system they are using." 278 }, 279 "attrition_reported": { 280 "applies": true, 281 "answer": false, 282 "justification": "With only a single participant, attrition is not explicitly discussed. The paper does not state whether additional participants were planned or whether the study was designed for just one participant." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": false, 289 "justification": "No API costs or token consumption is reported. The paper notes 'approximately 19.4% of questions trigger an Update Context operation, resulting in additional LLM calls' (A2) and reports LLM call counts in Table 6, but does not translate these into dollar costs or token counts." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "No total computational budget, GPU hours, total API spend, or hardware specifications are reported. The evaluations involve multiple GPT-4 API calls across thousands of benchmark examples, but the total cost is not quantified." 295 } 296 } 297 }, 298 "claims": [ 299 { 300 "claim": "AutoGen achieves 69.48% overall accuracy on the full MATH test dataset, compared to 55.18% for vanilla GPT-4.", 301 "evidence": "Section A1 (Appendix D) reports quantitative evaluation on the entire MATH test set (5000 problems) with results shown in Figure 4a.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "AutoGen with interactive retrieval outperforms DPR and AutoGen without interactive retrieval on the Natural Questions QA task.", 306 "evidence": "Figure 4b shows F1 and Recall metrics: AutoGen (25.88% F1, 66.65% Recall) vs. DPR (22.79% F1, 62.59% Recall) vs. AutoGen W/O interactive retrieval (15.12% F1, 58.56% Recall). An ablation confirms the interactive retrieval mechanism's role.", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "Adding a grounding agent to the ALFWorld system brings a 15% performance gain on average over the two-agent design.", 311 "evidence": "Table 3 shows 3-agent system at 69% average success rate vs. 54% for 2-agent system across 134 unseen ALFWorld tasks. Figure 10 provides a qualitative case study.", 312 "supported": "moderate" 313 }, 314 { 315 "claim": "Multi-agent design boosts F1 score in identifying unsafe code by 8% (GPT-4) and 35% (GPT-3.5-turbo) compared to single-agent.", 316 "evidence": "Figure 4d shows the comparison on 100 coding tasks (50 safe, 50 unsafe) in the OptiGuide application.", 317 "supported": "moderate" 318 }, 319 { 320 "claim": "AutoGen-based OptiGuide saves around 3x of user's time and reduces user interactions by 3-5 times compared to ChatGPT+Code Interpreter.", 321 "evidence": "Section A4 reports a user study with a single expert participant on 10 questions, with time measurements (4min 35sec vs 1.5min). Table 4 reports interaction saving ratios across 5 OptiGuide applications.", 322 "supported": "weak" 323 }, 324 { 325 "claim": "AutoGen's core workflow code for OptiGuide was reduced from over 430 lines to 100 lines.", 326 "evidence": "Stated in Section A4, but the comparison methodology (what constitutes 'core workflow code') is not rigorously defined.", 327 "supported": "weak" 328 } 329 ], 330 "methodology_tags": [ 331 "benchmark-eval", 332 "case-study" 333 ], 334 "key_findings": "AutoGen is an open-source multi-agent conversation framework from Microsoft Research that introduces conversable agents and conversation programming paradigms. Across six diverse applications (math problem solving, retrieval-augmented QA, text-world decision making, multi-agent coding, dynamic group chat, and conversational chess), the framework demonstrates competitive or superior performance compared to existing approaches. Key empirical results include 69.48% accuracy on MATH (vs. 55.18% for vanilla GPT-4), a 15% success rate improvement on ALFWorld from adding a grounding agent, and competitive performance on MiniWoB++ compared to the specialized RCI method.", 335 "red_flags": [ 336 { 337 "flag": "Vendor self-evaluation", 338 "detail": "The majority of authors are from Microsoft Research, and AutoGen is a Microsoft Research project. The paper evaluates AutoGen favorably against competitors without acknowledging this conflict of interest or including independent evaluation." 339 }, 340 { 341 "flag": "Single-participant user study", 342 "detail": "The OptiGuide user study (A4) uses a single 'expert Python programmer with proficiency in Gurobi' to claim 3x time savings. This sample size is far too small for generalizable claims, and the participant's expertise and relationship to the project are not disclosed." 343 }, 344 { 345 "flag": "No uncertainty quantification on main results", 346 "detail": "Primary benchmark results (MATH, Natural Questions, ALFWorld, MiniWoB++) are reported as single point estimates without confidence intervals, error bars, or significance tests. This makes it impossible to assess whether observed differences are meaningful." 347 }, 348 { 349 "flag": "Benchmark contamination risk unaddressed", 350 "detail": "GPT-4 is evaluated on MATH (2021), Natural Questions (2019), and ALFWorld (2021), all published before GPT-4's training cutoff. The paper does not discuss whether these benchmarks may have been in the training data." 351 }, 352 { 353 "flag": "Small and manual pilot study for group chat", 354 "detail": "The dynamic group chat (A5) evaluation uses only 12 manually crafted tasks. These tasks may be biased toward scenarios where AutoGen's group chat excels, and the sample size is too small for robust conclusions." 355 }, 356 { 357 "flag": "Missing model version specifications", 358 "detail": "All experiments use 'GPT-4' and 'GPT-3.5-turbo' without snapshot dates or API versions. Given that model behavior changes across versions, results may not be reproducible." 359 } 360 ], 361 "cited_papers": [ 362 { 363 "title": "Improving factuality and reasoning in language models through multiagent debate", 364 "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba", "Joshua B Tenenbaum", "Igor Mordatch"], 365 "year": 2023, 366 "arxiv_id": "2305.14325", 367 "relevance": "Foundational work on multi-agent debate for improving LLM factuality, directly compared as a baseline in AutoGen's math evaluation." 368 }, 369 { 370 "title": "Encouraging divergent thinking in large language models through multi-agent debate", 371 "authors": ["Tian Liang", "Zhiwei He", "Wenxiang Jiao"], 372 "year": 2023, 373 "relevance": "Multi-agent debate framework showing divergent thinking benefits, compared as a baseline in AutoGen." 374 }, 375 { 376 "title": "MetaGPT: Meta programming for multi-agent collaborative framework", 377 "authors": ["Sirui Hong", "Xiawu Zheng", "Jonathan Chen"], 378 "year": 2023, 379 "arxiv_id": "2308.00352", 380 "relevance": "Specialized multi-agent framework for software development, compared with AutoGen in Table 1 and preliminary math evaluations." 381 }, 382 { 383 "title": "CAMEL: Communicative agents for 'mind' exploration of large scale language model society", 384 "authors": ["Guohao Li", "Hasan Abed Al Kader Hammoud", "Hani Itani"], 385 "year": 2023, 386 "relevance": "Multi-agent communicative framework using role-playing and inception prompting, compared with AutoGen in Table 1." 387 }, 388 { 389 "title": "Voyager: An open-ended embodied agent with large language models", 390 "authors": ["Guanzhi Wang", "Yuqi Xie", "Yunfan Jiang"], 391 "year": 2023, 392 "arxiv_id": "2305.16291", 393 "relevance": "Open-ended LLM agent for embodied tasks, relevant as a specialized single-agent approach for comparison." 394 }, 395 { 396 "title": "Generative agents: Interactive simulacra of human behavior", 397 "authors": ["Joon Sung Park", "Joseph C O'Brien", "Carrie J Cai"], 398 "year": 2023, 399 "arxiv_id": "2304.03442", 400 "relevance": "Multi-agent simulation of human behavior using LLMs, relevant to understanding agent interaction patterns." 401 }, 402 { 403 "title": "ReAct: Synergizing reasoning and acting in language models", 404 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"], 405 "year": 2022, 406 "arxiv_id": "2210.03629", 407 "relevance": "Key prompting technique integrating reasoning and acting that is compared as a baseline in multiple AutoGen applications." 408 }, 409 { 410 "title": "The rise and potential of large language model based agents: A survey", 411 "authors": ["Zhiheng Xi", "Wenxiang Chen", "Xin Guo"], 412 "year": 2023, 413 "arxiv_id": "2309.07864", 414 "relevance": "Comprehensive survey on LLM-based agents, providing context for the multi-agent approach taken by AutoGen." 415 }, 416 { 417 "title": "A survey on large language model based autonomous agents", 418 "authors": ["Lei Wang", "Chen Ma", "Xueyang Feng"], 419 "year": 2023, 420 "arxiv_id": "2308.11432", 421 "relevance": "Survey on LLM-based autonomous agents, referenced for broader context on agent capabilities and frameworks." 422 }, 423 { 424 "title": "Self-collaboration code generation via ChatGPT", 425 "authors": ["Yihong Dong", "Xue Jiang", "Zhi Jin", "Ge Li"], 426 "year": 2023, 427 "arxiv_id": "2304.07590", 428 "relevance": "Multi-agent self-collaboration approach for code generation, relevant to understanding collaborative agent patterns for software tasks." 429 }, 430 { 431 "title": "Large language models for supply chain optimization", 432 "authors": ["Beibin Li", "Konstantina Mellou", "Bo Zhang"], 433 "year": 2023, 434 "arxiv_id": "2307.03875", 435 "relevance": "The OptiGuide system that AutoGen re-implements in Application A4, demonstrating multi-agent coding for operations research." 436 }, 437 { 438 "title": "An empirical study on challenging math problem solving with GPT-4", 439 "authors": ["Yiran Wu", "Feiran Jia", "Shaokun Zhang"], 440 "year": 2023, 441 "arxiv_id": "2306.01337", 442 "relevance": "Empirical evaluation of GPT-4 on math problem solving that provides context for AutoGen's math evaluation." 443 }, 444 { 445 "title": "Large language models as tool makers", 446 "authors": ["Tianle Cai", "Xuezhi Wang", "Tengyu Ma"], 447 "year": 2023, 448 "arxiv_id": "2305.17126", 449 "relevance": "Explores LLMs creating and using tools, relevant to the tool-backed agent capabilities in AutoGen." 450 } 451 ] 452 }