scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21861B)
      1 {
      2   "paper": {
      3     "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
      4     "authors": [
      5       "Qingyun Wu",
      6       "Gagan Bansal",
      7       "Jieyu Zhang",
      8       "Yiran Wu",
      9       "Beibin Li",
     10       "Erkang Zhu",
     11       "Li Jiang",
     12       "Xiaoyun Zhang",
     13       "Shaokun Zhang",
     14       "Jiale Liu",
     15       "Ahmed Awadallah",
     16       "Ryen W. White",
     17       "Doug Burger",
     18       "Chi Wang"
     19     ],
     20     "year": 2023,
     21     "venue": "arXiv",
     22     "arxiv_id": "2308.08155"
     23   },
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "GitHub repository URL provided: https://github.com/microsoft/autogen (footnote 2 in abstract)."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Evaluations use publicly available benchmarks (MATH dataset, Natural Questions, ALFWorld), but the custom 100-task OptiGuide dataset and the 12 group chat tasks are not released."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section. Only mentions pre-installing 'sympy' for math experiments and using specific LLM models."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions, README with commands, or scripts to replicate experiments. The paper describes workflows conceptually but does not provide runnable reproduction steps."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Results in Figures 4a-4d report only point estimates (e.g., '52.5%', '69.48%') with no confidence intervals or error bars."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Claims like 'AutoGen achieves the highest problem-solving success rate' and '15% performance gain' are made by comparing raw numbers without any statistical significance tests."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Percentage improvements with baselines are provided, e.g., 'AutoGen achieves 69.48% vs GPT-4's 55.18%' (Section A1), '15% performance gain' (Section A3), 'multi-agent design boosts F-1 by 8% (GPT-4) and 35% (GPT-3.5)' (Section A4)."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No justification for sample sizes. The 120 level-5 problems, 134 ALFWorld tasks, 100 OptiGuide tasks, and 12 group chat tasks are used without explanation of why these sizes are sufficient."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No standard deviations or variance across runs reported. The qualitative evaluation tests each problem 3 times but reports counts (e.g., '3/3', '2/3') without aggregate variance measures across the full evaluations."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple baselines compared: AutoGPT, ChatGPT+Code Interpreter, ChatGPT+Plugin, vanilla GPT-4, Multi-Agent Debate, LangChain ReAct, DPR (Sections A1-A4)."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Baselines are contemporary (2023): ChatGPT+Code Interpreter, Multi-Agent Debate, LangChain ReAct, MetaGPT, CAMEL. These represent the state of the art at time of writing."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Ablations conducted: interactive retrieval vs without (A2, Figure 4b), 2-agent vs 3-agent with grounding agent (A3, Figure 4c), single-agent vs multi-agent for safeguard (A4, Figure 4d), with vs without board agent (A6)."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Multiple metrics used: success ratio for A1/A3, F1 and Recall for A2/A4. Different metrics appropriate to each task domain."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Qualitative evaluation with manual inspection in Table 2 (A1), where failure reasons are manually analyzed. User experience analysis comparing verbosity and undesired behaviors across systems."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "ALFWorld uses '134 unseen tasks' (Section A3). MATH uses the standard test dataset. Natural Questions uses existing evaluation splits."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "MATH results show only overall accuracy (69.48%) and level-5 subset (52.5%) but no per-category breakdown across the 6 categories mentioned. ALFWorld and other tasks also show only aggregate numbers."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Table 2 provides detailed failure reasons for each system. Section A3 discusses 'occasional inability to leverage basic commonsense knowledge' and 'getting stuck in a loop.' A6 discusses illegitimate moves without board agent."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Reports that BabyAGI, CAMEL, and MetaGPT 'are not suitable choices for solving math problems out of the box' (Appendix D). Also notes that AutoGen fails 1 of 3 trials on the second math problem."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Abstract claims are general ('effectiveness of the framework in many example applications') and are supported by the six applications with empirical results in Section 3 and Appendix D."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Causal claims like 'introducing a grounding agent could bring in a 15% performance gain' and 'multi-agent design boosts F-1 by 8%' are supported by controlled ablation studies (removing/adding specific components)."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper title and abstract claim it is 'a generic framework for building diverse applications of various complexities' but tests on only 6 specific applications. Section 4 acknowledges 'early experimental stages' but the title/abstract are unbounded."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "No discussion of alternative explanations for the results. For example, performance gains could be due to additional compute (more LLM calls) rather than the multi-agent design. The ethics statement and future work section do not address confounds."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Paper says 'GPT-4' and 'GPT-3.5-turbo' without specific version snapshots (e.g., gpt-4-0613). The ALFWorld baseline uses 'text-davinci-003' which is a specific model but the main experiments lack version specificity."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Figure 5 (Appendix C) provides the full default system message for the AssistantAgent. The grounding agent's commonsense prompt is quoted in Section A3. The role-play prompt for speaker selection is mentioned but described in less detail."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "No temperature, top-p, max tokens, or other LLM inference hyperparameters reported for any experiment."
    158       },
    159       "scaffolding_described": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The multi-agent scaffolding is described in detail: Section 2 describes conversable agents, auto-reply mechanisms, conversation programming, GroupChatManager. Figure 2 illustrates the workflow. Each application (A1-A6) describes its specific agent topology."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "For MATH, '120 randomly selected level-5 problems' are used but the selection procedure is not documented. The 100 OptiGuide tasks are 'crafted to include equal numbers of safe and unsafe tasks' but crafting details are absent. The 12 group chat tasks are 'manually crafted' without criteria."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 4 (Discussion) and Appendix B.2 (Future Work) discuss limitations including safety challenges, difficulty of logging/adjusting complex workflows, and risks of fully autonomous agent conversations."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "Limitations are generic: 'fully autonomous agent conversations will need to be used with care,' 'high level of autonomy can also pose potential risks.' No specific threats to the validity of the experimental results are discussed."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No explicit scope boundaries stated. The paper does not specify what the results do NOT show. Section 4 says 'this work is still in its early experimental stages' but does not state specific untested settings or claims not being made."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No raw experimental data (agent conversation logs, individual problem results, timing data) is made available for verification."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "Data collection for custom datasets (100 OptiGuide tasks, 12 group chat tasks) is not described in detail. For benchmark datasets, references are provided but selection criteria for subsets (e.g., 120 MATH problems) are not specified."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants recruited for a study. The human-in-the-loop scenarios are demonstrations, not controlled user studies."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No documentation of the data pipeline from raw benchmark data to final reported results. For example, how the 120 problems were sampled, or how success/failure was determined for each system."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Acknowledgements section mentions 'Qingyun Wu would like to acknowledge the funding and research support from the College of Information Science and Technology at Penn State University.' Microsoft Research affiliation is listed."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations clearly listed: Microsoft Research, Pennsylvania State University, University of Washington, Xidian University. The Microsoft affiliation is relevant since AutoGen is a Microsoft product."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "Most authors are from Microsoft Research, and AutoGen is a Microsoft open-source project (github.com/microsoft/autogen). Microsoft has a direct interest in demonstrating the effectiveness of its own framework."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement. Several authors are Microsoft employees evaluating a Microsoft product, but this conflict is not explicitly acknowledged beyond listing affiliations."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No training data cutoff dates stated for GPT-4 or GPT-3.5-turbo despite evaluating them on benchmarks like MATH (published 2021) and Natural Questions."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of whether GPT-4 or GPT-3.5-turbo may have seen MATH, Natural Questions, or ALFWorld problems during training."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "MATH dataset (2021), Natural Questions (2019), and ALFWorld (2021) all predate GPT-4's likely training cutoff. No contamination analysis is provided."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human subjects study conducted. Human-in-the-loop demonstrations are not controlled user studies."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human subjects study requiring IRB approval."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in a study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in a study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human subjects experiment."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human subjects experiment."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human subjects experiment."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No API costs, token counts, or latency reported for any experiment despite making multiple LLM calls per problem across multiple systems."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No total computational budget, API spend, or hardware specifications stated."
    295       }
    296     }
    297   },
    298   "claims": [
    299     {
    300       "claim": "AutoGen achieves 69.48% accuracy on the full MATH test dataset, outperforming vanilla GPT-4 at 55.18%.",
    301       "evidence": "Figure 4a and Appendix D report these numbers on the MATH dataset using GPT-4 as the base model.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "AutoGen's interactive retrieval mechanism significantly improves QA performance over non-interactive retrieval.",
    306       "evidence": "Figure 4b shows F1 improvement from 15.12% to 25.88% and Recall from 58.56% to 66.65% on Natural Questions with GPT-3.5.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "Introducing a grounding agent brings a 15% average performance gain on ALFWorld.",
    311       "evidence": "Figure 4c shows the 3-agent system at 69% average vs the 2-agent system at 54% (a 15-percentage-point difference) on 134 unseen ALFWorld tasks.",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "Multi-agent design boosts F-1 score for unsafe code detection by 8% (GPT-4) and 35% (GPT-3.5) over single-agent.",
    316       "evidence": "Figure 4d shows multi-agent vs single-agent F1 scores on 100 OptiGuide coding tasks.",
    317       "supported": "moderate"
    318     },
    319     {
    320       "claim": "AutoGen reduces OptiGuide core workflow code from 430 lines to 100 lines.",
    321       "evidence": "Section A4 states this claim but no code comparison or verification is provided.",
    322       "supported": "weak"
    323     }
    324   ],
    325   "methodology_tags": [
    326     "benchmark-eval",
    327     "case-study"
    328   ],
    329   "key_findings": "AutoGen is an open-source multi-agent conversation framework that enables building LLM applications through customizable, conversable agents and conversation programming. Empirical evaluations across six applications (math problem solving, RAG, decision-making, multi-agent coding, dynamic group chat, conversational chess) show performance improvements over baselines like vanilla GPT-4, LangChain ReAct, and Multi-Agent Debate. The framework demonstrates that multi-agent architectures with specialized agents (e.g., grounding agents, safeguard agents) can meaningfully improve task success rates compared to single-agent approaches.",
    330   "red_flags": [
    331     {
    332       "flag": "Vendor self-evaluation",
    333       "detail": "Most authors are Microsoft Research employees evaluating AutoGen, a Microsoft open-source project. This conflict of interest is not explicitly acknowledged beyond listing affiliations."
    334     },
    335     {
    336       "flag": "No uncertainty quantification",
    337       "detail": "All results are reported as point estimates without confidence intervals, error bars, standard deviations, or significance tests. Claims of superiority rest entirely on raw number comparisons."
    338     },
    339     {
    340       "flag": "Benchmark contamination risk unaddressed",
    341       "detail": "GPT-4 is evaluated on MATH (2021), Natural Questions (2019), and ALFWorld (2021), all of which predate GPT-4's likely training cutoff. No contamination analysis is provided."
    342     },
    343     {
    344       "flag": "Small or ad-hoc evaluation sets",
    345       "detail": "Several evaluations use small or custom datasets: 12 manually crafted group chat tasks, qualitative evaluation on only 2 math problems tested 3 times each, 100 'crafted' OptiGuide tasks with no description of crafting methodology."
    346     },
    347     {
    348       "flag": "No cost comparison",
    349       "detail": "Multi-agent systems make more LLM calls than single-agent baselines, but no cost, latency, or token usage is reported. Performance gains may come at disproportionate computational cost."
    350     }
    351   ],
    352   "cited_papers": [
    353     {
    354       "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework",
    355       "authors": ["Sirui Hong"],
    356       "year": 2023,
    357       "arxiv_id": "2308.00352",
    358       "relevance": "Multi-agent framework for software development, direct competitor to AutoGen in multi-agent LLM applications."
    359     },
    360     {
    361       "title": "CAMEL: Communicative Agents for 'Mind' Exploration of Large Scale Language Model Society",
    362       "authors": ["Guohao Li"],
    363       "year": 2023,
    364       "relevance": "Pioneering multi-agent role-playing framework, compared with AutoGen in Table 1."
    365     },
    366     {
    367       "title": "Encouraging Divergent Thinking in Large Language Models through Multi-Agent Debate",
    368       "authors": ["Tian Liang"],
    369       "year": 2023,
    370       "relevance": "Multi-agent debate approach for improving LLM reasoning, used as baseline in AutoGen evaluations."
    371     },
    372     {
    373       "title": "Improving Factuality and Reasoning in Language Models through Multiagent Debate",
    374       "authors": ["Yilun Du"],
    375       "year": 2023,
    376       "arxiv_id": "2305.14325",
    377       "relevance": "Multi-agent debate for factuality improvement, foundational work for multi-agent LLM approaches."
    378     },
    379     {
    380       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    381       "authors": ["Shunyu Yao"],
    382       "year": 2022,
    383       "arxiv_id": "2210.03629",
    384       "relevance": "Foundational agent prompting technique combining reasoning and acting, used as baseline."
    385     },
    386     {
    387       "title": "Voyager: An Open-Ended Embodied Agent with Large Language Models",
    388       "authors": ["Guanzhi Wang"],
    389       "year": 2023,
    390       "arxiv_id": "2305.16291",
    391       "relevance": "LLM-based agent for open-ended exploration, relevant to agentic AI capabilities."
    392     },
    393     {
    394       "title": "Generative Agents: Interactive Simulacra of Human Behavior",
    395       "authors": ["Joon Sung Park"],
    396       "year": 2023,
    397       "arxiv_id": "2304.03442",
    398       "relevance": "Multi-agent simulation with LLM-powered agents, influential work in agentic AI."
    399     },
    400     {
    401       "title": "The Rise and Potential of Large Language Model Based Agents: A Survey",
    402       "authors": ["Zhiheng Xi"],
    403       "year": 2023,
    404       "arxiv_id": "2309.07864",
    405       "relevance": "Comprehensive survey of LLM-based agents, provides broader context for multi-agent frameworks."
    406     },
    407     {
    408       "title": "A Survey on Large Language Model Based Autonomous Agents",
    409       "authors": ["Lei Wang"],
    410       "year": 2023,
    411       "arxiv_id": "2308.11432",
    412       "relevance": "Survey of autonomous LLM agents, complementary to multi-agent conversation frameworks."
    413     },
    414     {
    415       "title": "Large Language Models as Tool Makers",
    416       "authors": ["Tianle Cai"],
    417       "year": 2023,
    418       "arxiv_id": "2305.17126",
    419       "relevance": "LLM tool-use capabilities relevant to agent frameworks and agentic AI."
    420     },
    421     {
    422       "title": "An Empirical Study on Challenging Math Problem Solving with GPT-4",
    423       "authors": ["Yiran Wu"],
    424       "year": 2023,
    425       "arxiv_id": "2306.01337",
    426       "relevance": "Empirical evaluation of GPT-4 on math problems, directly related to AutoGen's math evaluation."
    427     }
    428   ]
    429 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs