scan.json (25802B)
1 { 2 "paper": { 3 "title": "MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework", 4 "authors": ["Sirui Hong", "Mingchen Zhuge", "Jiaqi Chen", "Xiawu Zheng", "Yuheng Cheng", "Ceyao Zhang", "Jinlin Wang", "Zili Wang", "Steven Ka Shing Yau", "Zijuan Lin", "Liyang Zhou", "Chenyu Ran", "Lingfeng Xiao", "Chenglin Wu", "Jürgen Schmidhuber"], 5 "year": 2023, 6 "venue": "ICLR 2024", 7 "arxiv_id": "2308.00352" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "checklist": { 12 "artifacts": { 13 "code_released": { 14 "applies": true, 15 "answer": true, 16 "justification": "GitHub link provided: https://github.com/geekan/MetaGPT, mentioned in the abstract." 17 }, 18 "data_released": { 19 "applies": true, 20 "answer": false, 21 "justification": "HumanEval and MBPP are public benchmarks (YES for those), but the SoftwareDev dataset of 70 tasks is described in Table 8 with only 11 example prompts shown. The full dataset is not released with a download link." 22 }, 23 "environment_specified": { 24 "applies": true, 25 "answer": false, 26 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided in the paper. Only general mentions of Python libraries (Tkinter, Pillow) in demo examples." 27 }, 28 "reproduction_instructions": { 29 "applies": true, 30 "answer": false, 31 "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but the paper itself does not include commands or a reproducing results section." 32 } 33 }, 34 "statistical_methodology": { 35 "confidence_intervals_or_error_bars": { 36 "applies": true, 37 "answer": false, 38 "justification": "Main results in Figure 4 and Tables 1, 4 report point estimates only. Table 7 reports std dev for GPT variant experiments, but the main MetaGPT results have no uncertainty measures." 
39 }, 40 "significance_tests": { 41 "applies": true, 42 "answer": false, 43 "justification": "The paper claims MetaGPT 'outperforms' baselines based on comparing numbers (e.g., 85.9% vs 67.0%) without any statistical significance test." 44 }, 45 "effect_sizes_reported": { 46 "applies": true, 47 "answer": true, 48 "justification": "The paper reports improvements with baseline context, e.g., '4.2% and 5.4% in Pass@1 on HumanEval and MBPP' for the feedback mechanism, and provides baseline vs. system scores throughout (e.g., GPT-4 67.0% vs MetaGPT 85.9%)." 49 }, 50 "sample_size_justified": { 51 "applies": true, 52 "answer": false, 53 "justification": "The SoftwareDev evaluation uses only 7 tasks for the main comparison (Table 1, Table 4). No justification for why 7 tasks suffice. The full dataset has 70 tasks but only 7 are used for baselines comparison." 54 }, 55 "variance_reported": { 56 "applies": true, 57 "answer": false, 58 "justification": "Table 7 reports std dev for GPT variant experiments on HumanEval, but the main MetaGPT results (Figure 4, Tables 1, 4) show single-run numbers with no variance across runs." 59 } 60 }, 61 "evaluation_design": { 62 "baselines_included": { 63 "applies": true, 64 "answer": true, 65 "justification": "Multiple baselines included: AutoGPT, LangChain, AgentVerse, ChatDev for SoftwareDev; AlphaCode, Incoder, CodeGeeX, CodeGen, Codex, CodeT, PaLM, GPT-4 for HumanEval/MBPP." 66 }, 67 "baselines_contemporary": { 68 "applies": true, 69 "answer": true, 70 "justification": "Baselines include GPT-4 (2023), ChatDev (2023), AgentVerse (2023), which were contemporary at submission time." 71 }, 72 "ablation_study": { 73 "applies": true, 74 "answer": true, 75 "justification": "Table 3 shows an ablation study on roles (adding Product Manager, Architect, Project Manager incrementally). The executable feedback mechanism is also ablated (MetaGPT w/o Feedback vs MetaGPT)." 
76 }, 77 "multiple_metrics": { 78 "applies": true, 79 "answer": true, 80 "justification": "Pass@1 for HumanEval/MBPP; Executability, Cost, Code Statistics, Productivity, Human Revision Cost for SoftwareDev (Table 1)." 81 }, 82 "human_evaluation": { 83 "applies": true, 84 "answer": true, 85 "justification": "Executability scores (1-4) and Human Revision Cost are evaluated by humans for the SoftwareDev benchmark (Section 4.1 Evaluation Metrics: 'human evaluations (A, E)')." 86 }, 87 "held_out_test_set": { 88 "applies": true, 89 "answer": true, 90 "justification": "HumanEval and MBPP are standard held-out test benchmarks. The SoftwareDev dataset is newly created by the authors." 91 }, 92 "per_category_breakdown": { 93 "applies": true, 94 "answer": true, 95 "justification": "Table 4 shows per-task executability scores for 7 individual tasks. Table 9 provides detailed per-task statistics for 11 tasks." 96 }, 97 "failure_cases_discussed": { 98 "applies": true, 99 "answer": true, 100 "justification": "Table 9 lists specific failure modes per task (e.g., 'TypeError', 'PNG file missing', 'dependency error', 'model training method not implement'). Section D.1 discusses system limitations." 101 }, 102 "negative_results_reported": { 103 "applies": true, 104 "answer": true, 105 "justification": "Table 5 shows MetaGPT with Deepseek Coder 33B achieving only 1.4 executability (near failure). The ablation in Table 3 shows that removing roles degrades performance." 106 } 107 }, 108 "claims_and_evidence": { 109 "abstract_claims_supported": { 110 "applies": true, 111 "answer": true, 112 "justification": "Abstract claims 'generates more coherent solutions than previous chat-based multi-agent systems' is supported by Tables 1 and 4. State-of-the-art on HumanEval/MBPP is supported by Figure 4." 
113 }, 114 "causal_claims_justified": { 115 "applies": true, 116 "answer": true, 117 "justification": "Causal claims like 'SOPs improve collaboration' are supported by controlled ablations (Table 3 — adding roles incrementally) and the feedback mechanism ablation. Single-variable manipulation in ablation design is adequate." 118 }, 119 "generalization_bounded": { 120 "applies": true, 121 "answer": false, 122 "justification": "The title claims 'Multi-Agent Collaborative Framework' broadly but evaluation is only on Python code generation tasks (HumanEval, MBPP, SoftwareDev). The abstract says 'collaborative software engineering benchmarks' but scope is limited to GPT-4 backend, Python, and specific task types." 123 }, 124 "alternative_explanations_discussed": { 125 "applies": true, 126 "answer": false, 127 "justification": "No discussion of alternative explanations for results. The improvement could partly be due to GPT-4's inherent capability rather than the SOP framework, or to the additional token budget used. These confounds are not addressed." 128 }, 129 "proxy_outcome_distinction": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper measures Pass@1 and executability on toy tasks but frames results as evidence for 'collaborative software engineering' capability. No discussion of whether these benchmarks capture real software engineering complexity." 133 } 134 }, 135 "setup_transparency": { 136 "model_versions_specified": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper says 'GPT-4' and 'GPT-3.5-Turbo' without version snapshots in the main experiments. Table 7 mentions 'gpt-4-0613' and 'gpt-3.5-turbo-0613' but only for the supplementary HumanEval variant experiment, not for the main MetaGPT results." 
140 }, 141 "prompts_provided": { 142 "applies": true, 143 "answer": false, 144 "justification": "Prompts are described conceptually (role definitions, SOP structure) but the actual prompt text sent to GPT-4 for each role is not provided. Appendix B shows demo outputs but not the prompts that generated them." 145 }, 146 "hyperparameters_reported": { 147 "applies": true, 148 "answer": false, 149 "justification": "No temperature, top-p, max tokens, or other API parameters are reported for the main experiments." 150 }, 151 "scaffolding_described": { 152 "applies": true, 153 "answer": true, 154 "justification": "The agentic scaffolding is described in detail: role definitions (Section 3.1), communication protocol with publish-subscribe (Section 3.2), executable feedback mechanism (Section 3.3), with workflow diagrams (Figures 1-3)." 155 }, 156 "data_preprocessing_documented": { 157 "applies": true, 158 "answer": false, 159 "justification": "For HumanEval/MBPP: 'we slightly modified the prompts to align with response format requirements' but the modifications are not described. For SoftwareDev: task selection criteria for the 7 comparison tasks are described as 'randomly select seven representative tasks' without documenting the selection." 160 } 161 }, 162 "limitations_and_scope": { 163 "limitations_section_present": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section D.1 'Limitation' discusses system-side and human-user-side limitations." 167 }, 168 "threats_to_validity_specific": { 169 "applies": true, 170 "answer": false, 171 "justification": "The limitations in D.1 are about capability gaps ('cannot fully cater to specific scenarios, such as UI and front-end') rather than threats to the validity of the evaluation. No discussion of whether the results could be invalid or misleading." 172 }, 173 "scope_boundaries_stated": { 174 "applies": true, 175 "answer": false, 176 "justification": "No explicit statements about what the results do NOT show. 
The paper does not bound its claims to specific settings, languages, or task types." 177 } 178 }, 179 "data_integrity": { 180 "raw_data_available": { 181 "applies": true, 182 "answer": false, 183 "justification": "No raw experimental outputs (generated code, logs, intermediate artifacts) are released for independent verification." 184 }, 185 "data_collection_described": { 186 "applies": true, 187 "answer": true, 188 "justification": "HumanEval and MBPP are well-described standard benchmarks. The SoftwareDev dataset construction is described in Section 4.1 and Table 8 with task descriptions and scope (Figure 5)." 189 }, 190 "recruitment_methods_described": { 191 "applies": false, 192 "answer": false, 193 "justification": "No human participants recruited. Human evaluators are mentioned for executability scoring but their selection/identity is not described — however this is a benchmark eval, not a human subjects study. NA since data sources are standard benchmarks and a custom dataset." 194 }, 195 "data_pipeline_documented": { 196 "applies": true, 197 "answer": false, 198 "justification": "The pipeline from raw model outputs to final scores is not documented. How executability was scored, who scored it, and inter-rater reliability are not described." 199 } 200 }, 201 "conflicts_of_interest": { 202 "funding_disclosed": { 203 "applies": true, 204 "answer": false, 205 "justification": "No funding section or grant acknowledgments found. The acknowledgment section thanks individuals for editing help but does not disclose funding sources." 206 }, 207 "affiliations_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "Author affiliations are clearly listed: DeepWisdom, KAUST, Xiamen University, CUHK-Shenzhen, etc. Chenglin Wu is identified as CEO of DeepWisdom." 211 }, 212 "funder_independent_of_outcome": { 213 "applies": true, 214 "answer": false, 215 "justification": "DeepWisdom is a company that initiated and hosts MetaGPT. The CEO is an author. 
The paper evaluates their own product. This is a clear non-independent funding/ownership relationship, not disclosed as a conflict." 216 }, 217 "financial_interests_declared": { 218 "applies": true, 219 "answer": false, 220 "justification": "No competing interests statement. Chenglin Wu is CEO of DeepWisdom which hosts MetaGPT and later launched AgentStore (Appendix A.2), but no financial interest declaration is provided." 221 } 222 }, 223 "contamination": { 224 "training_cutoff_stated": { 225 "applies": true, 226 "answer": false, 227 "justification": "The paper uses GPT-4 on HumanEval and MBPP but does not state GPT-4's training data cutoff date." 228 }, 229 "train_test_overlap_discussed": { 230 "applies": true, 231 "answer": false, 232 "justification": "HumanEval was published in 2021 and MBPP in 2021. GPT-4 could have seen these benchmarks during training. No discussion of this overlap." 233 }, 234 "benchmark_contamination_addressed": { 235 "applies": true, 236 "answer": false, 237 "justification": "HumanEval and MBPP have been publicly available since 2021, well before GPT-4's training. No contamination analysis is provided." 238 } 239 }, 240 "human_studies": { 241 "pre_registered": { 242 "applies": false, 243 "answer": false, 244 "justification": "No human participants study. Human evaluators scored executability but this is not a human subjects study." 245 }, 246 "irb_or_ethics_approval": { 247 "applies": false, 248 "answer": false, 249 "justification": "No human subjects study conducted." 250 }, 251 "demographics_reported": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants." 255 }, 256 "inclusion_exclusion_criteria": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants." 260 }, 261 "randomization_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants." 
265 }, 266 "blinding_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants." 270 }, 271 "attrition_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants." 275 } 276 }, 277 "cost_and_practicality": { 278 "inference_cost_reported": { 279 "applies": true, 280 "answer": true, 281 "justification": "Table 1 reports token usage (24,613 / 31,255) and running time (503s / 541s). Table 3 reports expense per run (e.g., $0.915 to $1.385). Table 9 reports money costs per task." 282 }, 283 "compute_budget_stated": { 284 "applies": true, 285 "answer": true, 286 "justification": "Tables 1, 3, and 9 report running times, token usage, and dollar costs. Table 5 reports time per run for different backends." 287 } 288 }, 289 "experimental_rigor": { 290 "seed_sensitivity_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "Main MetaGPT results are single-run. Table 7 reports 5 runs with std dev for GPT variant experiments, but the core MetaGPT vs. baseline comparisons have no seed sensitivity analysis." 294 }, 295 "number_of_runs_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "The number of runs for the main MetaGPT experiments is not stated. Table 7 states '5 times' for the GPT variant experiment only." 299 }, 300 "hyperparameter_search_budget": { 301 "applies": true, 302 "answer": false, 303 "justification": "No hyperparameter search budget reported. The maximum 3 retries for executable feedback appears tuned but no search budget is described." 304 }, 305 "best_config_selection_justified": { 306 "applies": true, 307 "answer": false, 308 "justification": "No justification for why the specific configuration (roles, retry limit, prompt design) was chosen. The ablation shows incremental roles but doesn't explain how the final 4-role + feedback configuration was selected." 
309 }, 310 "multiple_comparison_correction": { 311 "applies": false, 312 "answer": false, 313 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 314 }, 315 "self_comparison_bias_addressed": { 316 "applies": true, 317 "answer": false, 318 "justification": "Authors compare their system (MetaGPT) against their own re-implementations/runs of baselines (AutoGPT, LangChain, AgentVerse, ChatDev) without acknowledging self-comparison bias." 319 }, 320 "compute_budget_vs_performance": { 321 "applies": true, 322 "answer": false, 323 "justification": "MetaGPT uses significantly more tokens than ChatDev (31,255 vs 19,292) but this compute difference is not discussed as a confound for the performance comparison." 324 }, 325 "benchmark_construct_validity": { 326 "applies": true, 327 "answer": false, 328 "justification": "No discussion of whether HumanEval/MBPP (single-function problems) actually measure multi-agent software engineering capability. The SoftwareDev benchmark validity is not questioned." 329 }, 330 "scaffold_confound_addressed": { 331 "applies": true, 332 "answer": false, 333 "justification": "MetaGPT uses a sophisticated multi-agent scaffold while baselines use simpler scaffolds. The paper attributes improvement to the SOP design but does not control for the overall scaffold complexity/token budget difference." 334 } 335 }, 336 "data_leakage": { 337 "temporal_leakage_addressed": { 338 "applies": true, 339 "answer": false, 340 "justification": "HumanEval (2021) and MBPP (2021) predate GPT-4's training. No discussion of temporal leakage." 341 }, 342 "feature_leakage_addressed": { 343 "applies": true, 344 "answer": false, 345 "justification": "No discussion of whether the evaluation setup leaks answer information. The multi-pass design (PRD → design → code → test) could interact with model memorization, but this is not addressed." 
346 }, 347 "non_independence_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "No discussion of whether HumanEval/MBPP problems share structure with GPT-4's training data." 351 }, 352 "leakage_detection_method": { 353 "applies": true, 354 "answer": false, 355 "justification": "No leakage detection or prevention method is applied." 356 } 357 } 358 }, 359 "claims": [ 360 { 361 "claim": "MetaGPT achieves 85.9% and 87.7% Pass@1 on HumanEval and MBPP respectively, state-of-the-art performance.", 362 "evidence": "Figure 4 shows Pass@1 comparisons. Prior best was Codex+CodeT at 65.8%/67.7% and GPT-4 at 67.0%.", 363 "supported": "moderate" 364 }, 365 { 366 "claim": "MetaGPT achieves 100% task completion rate on the SoftwareDev benchmark.", 367 "evidence": "Mentioned in Section 1 but Table 4 shows scores ranging from 3 to 4 (not all perfect), with Flappy Bird at 3. The 100% completion claim appears to mean 'produces output' not 'works perfectly'.", 368 "supported": "weak" 369 }, 370 { 371 "claim": "The executable feedback mechanism improves Pass@1 by 4.2% on HumanEval and 5.4% on MBPP.", 372 "evidence": "Figure 4 compares MetaGPT w/o Feedback (81.7%, 82.3%) vs MetaGPT (85.9%, 87.7%). Table 1 shows executability improvement from 3.67 to 3.75.", 373 "supported": "moderate" 374 }, 375 { 376 "claim": "MetaGPT outperforms ChatDev on the SoftwareDev dataset in nearly all metrics.", 377 "evidence": "Table 1 shows MetaGPT scores 3.75 vs ChatDev 2.25 executability, 0.83 vs 2.5 human revision cost, 124.3 vs 248.9 productivity ratio.", 378 "supported": "moderate" 379 }, 380 { 381 "claim": "SOPs significantly improve multi-agent collaboration and reduce hallucination cascading.", 382 "evidence": "Table 3 ablation shows adding roles improves executability (1.0 → 4.0) and reduces revisions (10 → 2.5). 
Qualitative argument about structured communication but no direct hallucination measurement.", 383 "supported": "moderate" 384 } 385 ], 386 "methodology_tags": ["benchmark-eval"], 387 "key_findings": "MetaGPT introduces a multi-agent framework encoding software development SOPs into LLM prompt sequences, achieving 85.9% and 87.7% Pass@1 on HumanEval and MBPP. On 7 tasks drawn from a custom 70-task SoftwareDev benchmark, MetaGPT scores 3.75/4.0 executability vs ChatDev's 2.25, while requiring fewer human revisions (0.83 vs 2.5). Ablations show that adding specialized roles and executable feedback both contribute to improved performance, though the system uses significantly more tokens than ChatDev (31,255 vs 19,292).", 388 "red_flags": [ 389 { 390 "flag": "Company evaluating its own product", 391 "detail": "DeepWisdom CEO (Chenglin Wu) is a corresponding author. DeepWisdom initiated MetaGPT and later monetized it through AgentStore. The paper evaluates their own framework with no independent evaluation or conflict disclosure." 392 }, 393 { 394 "flag": "Tiny evaluation sample for main claims", 395 "detail": "The SoftwareDev comparison uses only 7 tasks despite 70 being available. Table 4 shows per-task scores but N=7 is very small for the claims made. Table 9 adds per-task statistics for only 11 tasks, and only for MetaGPT w/o feedback, not for baselines." 396 }, 397 { 398 "flag": "No variance or multiple runs for main results", 399 "detail": "Core HumanEval/MBPP results and SoftwareDev comparisons appear to be single runs. LLM outputs are stochastic; single-run comparisons are unreliable." 400 }, 401 { 402 "flag": "Benchmark contamination risk", 403 "detail": "HumanEval and MBPP were published in 2021. GPT-4 was likely trained on data including these benchmarks. The multi-agent pipeline may amplify memorized solutions through structured prompting, inflating scores beyond what the framework itself contributes." 
404 }, 405 { 406 "flag": "Unfair compute comparison", 407 "detail": "MetaGPT uses 31,255 tokens vs ChatDev's 19,292 (~62% more). The additional tokens buy more opportunities for correction. Performance improvement may partly reflect token budget rather than architectural innovation." 408 }, 409 { 410 "flag": "100% completion rate claim is misleading", 411 "detail": "Section 1 claims a '100% task completion rate' but Table 4 shows Flappy Bird scores only 3/4 (not flawless). The claim appears to mean 'produces some output', which is a very low bar." 412 } 413 ], 414 "cited_papers": [ 415 { 416 "title": "Communicative Agents for Software Development", 417 "authors": ["Chen Qian", "Xin Cong", "Cheng Yang", "Weize Chen"], 418 "year": 2023, 419 "relevance": "ChatDev: main baseline, multi-agent framework for code generation using chat-based collaboration." 420 }, 421 { 422 "title": "CAMEL: Communicative Agents for 'Mind' Exploration of Large Scale Language Model Society", 423 "authors": ["Guohao Li"], 424 "year": 2023, 425 "relevance": "Role-playing multi-agent framework; closest prior work on multi-agent collaboration for programming." 426 }, 427 { 428 "title": "Evaluating Large Language Models Trained on Code", 429 "authors": ["Mark Chen", "Jerry Tworek"], 430 "year": 2021, 431 "relevance": "Introduces HumanEval benchmark and Codex; primary evaluation benchmark used in this paper." 432 }, 433 { 434 "title": "Program Synthesis with Large Language Models", 435 "authors": ["Jacob Austin", "Augustus Odena"], 436 "year": 2021, 437 "relevance": "Introduces MBPP benchmark; second primary evaluation benchmark used." 438 }, 439 { 440 "title": "Generative Agents: Interactive Simulacra of Human Behavior", 441 "authors": ["Joon Sung Park"], 442 "year": 2023, 443 "relevance": "Foundational multi-agent simulation work; MetaGPT draws analogy to simulated towns." 
444 }, 445 { 446 "title": "Reflexion: An Autonomous Agent with Dynamic Memory and Self-Reflection", 447 "authors": ["Noah Shinn", "Beck Labash", "Ashwin Gopinath"], 448 "year": 2023, 449 "relevance": "Self-reflection mechanism for LLM agents; related to MetaGPT's executable feedback." 450 }, 451 { 452 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 453 "authors": ["Shunyu Yao"], 454 "year": 2022, 455 "relevance": "ReAct-style reasoning loop used as the base agent behavior pattern in MetaGPT." 456 }, 457 { 458 "title": "Voyager: An Open-Ended Embodied Agent with Large Language Models", 459 "authors": ["Guanzhi Wang"], 460 "year": 2023, 461 "relevance": "Open-ended LLM agent with self-improvement; MetaGPT draws parallel to Minecraft sandbox." 462 }, 463 { 464 "title": "AgentVerse: Facilitating Multi-Agent Collaboration and Exploring Emergent Behaviors in Agents", 465 "authors": ["Weize Chen"], 466 "year": 2023, 467 "relevance": "Multi-agent collaboration framework; direct baseline comparison in SoftwareDev evaluation." 468 }, 469 { 470 "title": "Self-collaboration Code Generation via ChatGPT", 471 "authors": ["Yihong Dong"], 472 "year": 2023, 473 "relevance": "Multi-persona self-collaboration for code generation; related multi-agent approach." 474 }, 475 { 476 "title": "Competition-Level Code Generation with AlphaCode", 477 "authors": ["Yujia Li"], 478 "year": 2022, 479 "relevance": "Competition-level code generation baseline compared in HumanEval results." 480 }, 481 { 482 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 483 "authors": ["Jason Wei"], 484 "year": 2022, 485 "relevance": "CoT prompting foundation; MetaGPT's SOP design compared to injecting CoT in LLMs." 486 } 487 ] 488 }