scan.json (26283B)
1 { 2 "paper": { 3 "title": "MaCTG: Multi-Agent Collaborative Thought Graph for Automatic Programming", 4 "authors": [ 5 "Zixiao Zhao", 6 "Jing Sun", 7 "Zhe Hou", 8 "Zhiyuan Wei", 9 "Cheng-Hao Cai", 10 "Miao Qiao", 11 "Jin Song Dong" 12 ], 13 "year": 2024, 14 "venue": "arXiv preprint", 15 "arxiv_id": "2410.19245" 16 }, 17 "checklist": { 18 "artifacts": { 19 "code_released": { 20 "applies": true, 21 "answer": true, 22 "justification": "The paper provides a GitHub link: https://github.com/MaCTG2025/MaCTG (stated in the abstract, line 70)." 23 }, 24 "data_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "The BCVPP dataset is described in detail, and the GitHub repository is provided which presumably contains or links to it. Additionally, the simple projects are sourced from the publicly available 'OpenCV_Projects' repository [34]. The paper states the project can be found at the GitHub link." 28 }, 29 "environment_specified": { 30 "applies": true, 31 "answer": false, 32 "justification": "The paper mentions hardware (AMD Ryzen Threadripper PRO 3995WX, 252GB RAM, two NVIDIA RTX A6000 GPUs) and Ubuntu 22.04, but does not provide a requirements.txt, Dockerfile, or detailed dependency list with library versions sufficient to recreate the environment." 33 }, 34 "reproduction_instructions": { 35 "applies": true, 36 "answer": false, 37 "justification": "No step-by-step reproduction instructions are provided in the paper itself. The GitHub repository is referenced but the paper does not contain a 'Reproducing Results' section or commands to run." 38 } 39 }, 40 "statistical_methodology": { 41 "confidence_intervals_or_error_bars": { 42 "applies": true, 43 "answer": false, 44 "justification": "Results in Table 1 are reported as point estimates (e.g., '83.33%') with no confidence intervals or error bars." 
45 }, 46 "significance_tests": { 47 "applies": true, 48 "answer": false, 49 "justification": "The paper claims MaCTG outperforms baselines based on comparing accuracy numbers without any statistical significance tests (no p-values, t-tests, or similar)." 50 }, 51 "effect_sizes_reported": { 52 "applies": true, 53 "answer": true, 54 "justification": "The paper reports percentage improvements with baseline context, e.g., '89.09% cost reduction compared to existing multi-agent frameworks' and the ablation study shows specific accuracy drops (e.g., '14% on medium tasks and 20% on hard tasks' when removing context-aware planning). Table 1 provides full baseline-context numbers." 55 }, 56 "sample_size_justified": { 57 "applies": true, 58 "answer": false, 59 "justification": "The BCVPP dataset comprises only 90 projects (30 simple, 50 medium, 10 hard). No justification is given for why these numbers are sufficient, and the 10 hard projects in particular are very small for drawing conclusions. No power analysis is discussed." 60 }, 61 "variance_reported": { 62 "applies": true, 63 "answer": false, 64 "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single runs with no indication of multiple experimental runs." 65 } 66 }, 67 "evaluation_design": { 68 "baselines_included": { 69 "applies": true, 70 "answer": true, 71 "justification": "Table 1 includes comparisons against open-source models (CodeLLaMA-Instruct-70B, LLaMA3.3-70B, Qwen2.5-72B, LLaMA3.2-Vision-90B), proprietary tools/models (Copilot Workspace, OpenAI-o1, OpenAI-o3-mini, Claude3.7-Sonnet, DeepSeek-R1), and multi-agent frameworks (ChatDev, MetaGPT)." 72 }, 73 "baselines_contemporary": { 74 "applies": true, 75 "answer": true, 76 "justification": "Baselines include very recent models: OpenAI-o3-mini-2025-01-31, Claude 3.7 Sonnet, DeepSeek-R1, LLaMA3.3-70B, Qwen2.5-72B. These are contemporary and competitive as of 2024-2025." 
77 }, 78 "ablation_study": { 79 "applies": true, 80 "answer": true, 81 "justification": "Section 4.3.2 presents ablation studies removing multi-scale evaluation, context-aware planning adjustment, and replacing collaborative reasoning with independent reasoning. Results are shown in Table 1." 82 }, 83 "multiple_metrics": { 84 "applies": true, 85 "answer": false, 86 "justification": "The paper uses only a single metric: execution accuracy (whether the output matches expected output). No other metrics such as code quality, compilation rate, or partial correctness are reported." 87 }, 88 "human_evaluation": { 89 "applies": true, 90 "answer": false, 91 "justification": "No human evaluation of the generated code is included. Evaluation is entirely automated via output matching. Human evaluation could assess code quality, readability, or maintainability of the generated projects." 92 }, 93 "held_out_test_set": { 94 "applies": true, 95 "answer": false, 96 "justification": "There is no mention of a separate dev/test split. The entire BCVPP dataset of 90 projects appears to be the test set, with no indication that any portion was used for tuning or development." 97 }, 98 "per_category_breakdown": { 99 "applies": true, 100 "answer": true, 101 "justification": "Table 1 provides per-difficulty breakdowns (Simple, Medium, Hard) for all methods, showing performance variation across task complexity levels." 102 }, 103 "failure_cases_discussed": { 104 "applies": true, 105 "answer": true, 106 "justification": "The paper discusses failure patterns including cascading hallucinations, inconsistencies between modules (Figure 4), variable type mismatches, and incorrect file references. Section 5 discusses limitations including execution speed and stability issues." 107 }, 108 "negative_results_reported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The ablation study reports configurations that performed worse. 
The 'MaCTG with independent reasoning' variant scored only 44.44% overall. The paper also reports that removing context-aware planning led to 14% drop on medium and 20% drop on hard tasks." 112 } 113 }, 114 "claims_and_evidence": { 115 "abstract_claims_supported": { 116 "applies": true, 117 "answer": true, 118 "justification": "The abstract claims 83.33% accuracy and 89.09% cost reduction. Table 1 confirms 83.33% overall accuracy. Table 2 shows MaCTG cost of $0.61 vs. ChatDev GPT-3.5 at $5.81, broadly supporting the reduction claim (1 - 0.61/5.81 is about 89.50%; the exact 89.09% figure matches the $5.59 MetaGPT GPT-4o baseline). Neither baseline is the most expensive multi-agent framework (ChatDev GPT-4o at $7.83), so the comparison base is selectively chosen." 119 }, 120 "causal_claims_justified": { 121 "applies": true, 122 "answer": true, 123 "justification": "The paper makes causal claims about components improving performance (e.g., 'removing multi-scale evaluation resulted in a substantial accuracy drop'). These are supported by controlled ablation experiments where single components are removed, which is an adequate design for such claims." 124 }, 125 "generalization_bounded": { 126 "applies": true, 127 "answer": false, 128 "justification": "The title claims 'Automatic Programming' broadly, but the evaluation is exclusively on image processing tasks using OpenCV in Python. The abstract does not bound claims to this domain. While the paper mentions the medical imaging motivation in Section 4.1, the broad title and abstract language ('auto-programming tasks', 'real-world applicability') extend well beyond what was tested." 129 }, 130 "alternative_explanations_discussed": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper does not discuss alternative explanations for MaCTG's performance gains. For example, the performance advantage could partly stem from using DeepSeek-V3 (a very strong model) rather than the collaborative framework itself. No confound analysis is provided." 
134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": true, 139 "answer": false, 140 "justification": "The paper specifies model names like 'DeepSeek-V3', 'Qwen2.5-Coder-7B', 'GPT-4o', 'OpenAI-o1-2024-12-17', and 'OpenAI-o3-mini-2025-01-31'. However, for DeepSeek-V3, no specific version/snapshot date is given beyond the model name. 'Claude3.7-sonnet' lacks a snapshot date. 'GPT-4o' used for ChatDev and MetaGPT has no version suffix. The OpenAI o1 and o3-mini entries do include date versions." 141 }, 142 "prompts_provided": { 143 "applies": true, 144 "answer": false, 145 "justification": "The paper describes agent roles and specifications in natural language (Section 3.1.1) but does not provide the actual prompt text used for any of the agents. The prompts are described conceptually ('we prompt it to minimize the number of modules') without providing the full prompt templates or actual text." 146 }, 147 "hyperparameters_reported": { 148 "applies": true, 149 "answer": false, 150 "justification": "No hyperparameters are reported for the LLM API calls (temperature, top-p, max tokens, etc.) or for the locally deployed Qwen2.5-Coder-7B model." 151 }, 152 "scaffolding_described": { 153 "applies": true, 154 "answer": true, 155 "justification": "The agentic scaffolding is described in substantial detail in Section 3: the five agent types (Team Leader, Module Leader, Function Coordinator, Coder, Tester), their interactions, the graph-based reasoning protocol, context-sharing mechanism, supervised adjustment, multi-scale validation, and assembly process are all documented with diagrams (Figures 1-5)." 
156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 4.1 documents the dataset construction process: 30 simple projects selected from the OpenCV_Projects repository, 100 medium projects generated by OpenAI-o1 with calibration that resulted in 50 being finalized, and 10 hard projects manually identified. The filtering criteria (e.g., medium projects 'lacked coherent and logical relationships' and were reviewed/refined) are described." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 5 (Discussion) contains a 'Threats to validity' subsection that discusses limitations including execution speed and stability." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": false, 172 "justification": "The threats to validity are brief and somewhat generic: (1) execution speed is slow due to locally deployed models, and (2) the dynamic agent structure introduces variability. These are more operational limitations than threats to the validity of the experimental findings. No discussion of threats like the narrow domain (only image processing), the small sample size (especially 10 hard projects), the lack of multiple runs, or potential confounds." 173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": false, 177 "justification": "The paper does not explicitly state what the results do NOT show. There is no discussion bounding claims to Python/OpenCV/image processing, no statement about what populations of programming tasks are excluded, and no explicit limitations on generalizability claims." 178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": true, 184 "justification": "The GitHub repository (https://github.com/MaCTG2025/MaCTG) is provided. 
The BCVPP dataset projects and source data from the OpenCV_Projects repository are referenced, making the raw data potentially available for verification." 185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": true, 189 "justification": "Section 4.1 describes how data was collected: 30 simple projects from the OpenCV_Projects GitHub repository, 100 medium projects generated by OpenAI-o1 (refined to 50 after calibration), and 10 hard projects manually identified. The sources and rationale are described." 190 }, 191 "recruitment_methods_described": { 192 "applies": false, 193 "answer": false, 194 "justification": "No human participants are involved. The study evaluates automated programming systems on a benchmark dataset." 195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section 4.1 documents the pipeline: selection from OpenCV_Projects repository (30 simple) → generation via OpenAI-o1 (100 medium candidates) → calibration and filtering (50 medium finalized) → manual identification (10 hard). The filtering rationale (incoherent project descriptions) and final counts are provided." 200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": false, 206 "justification": "No funding information or acknowledgments section is present in the paper." 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "Author affiliations are clearly listed: University of Auckland, Griffith University, Beijing Institute of Technology, Suzhou Industrial Park Monash Research Institute, National University of Singapore. None appear to be affiliated with the model providers (DeepSeek, Qwen/Alibaba)." 212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding is disclosed, so independence cannot be assessed. 
The absence of a funding disclosure means this criterion cannot be verified." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests or financial interests statement is present in the paper." 222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": true, 227 "answer": false, 228 "justification": "The paper does not state the training data cutoff dates for DeepSeek-V3 or Qwen2.5-Coder-7B. This is relevant because the BCVPP dataset includes projects from a public GitHub repository that could be in the training data." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": true, 232 "answer": false, 233 "justification": "No discussion of whether the OpenCV_Projects repository content (used for simple projects) or similar image processing code might appear in the training data of DeepSeek-V3 or Qwen2.5-Coder. The OpenCV_Projects repo has 340+ stars and is publicly available, making contamination plausible." 234 }, 235 "benchmark_contamination_addressed": { 236 "applies": true, 237 "answer": false, 238 "justification": "The BCVPP dataset includes 30 simple projects from a public GitHub repository (OpenCV_Projects, with 340+ stars). This repository was available before the training cutoffs of the models used. No contamination analysis is provided for any of the evaluated models." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants are involved in this study." 246 }, 247 "irb_or_ethics_approval": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants are involved in this study." 251 }, 252 "demographics_reported": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants are involved in this study." 
256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants are involved in this study." 261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants are involved in this study." 266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants are involved in this study." 271 }, 272 "attrition_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants are involved in this study." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": true, 281 "answer": true, 282 "justification": "Table 2 reports the total API cost for evaluating on BCVPP: MaCTG costs $0.61, compared to ChatDev GPT-3.5 at $5.81, ChatDev GPT-4o at $7.83, MetaGPT GPT-4o at $5.59. The costs are directly collected from API platforms." 283 }, 284 "compute_budget_stated": { 285 "applies": true, 286 "answer": true, 287 "justification": "Hardware configuration is specified (AMD Ryzen Threadripper PRO 3995WX, 252GB RAM, two NVIDIA RTX A6000 GPUs) and API costs are reported. However, total GPU hours or wall-clock time for the experiments are not stated. The hardware and API cost together provide a partial picture." 288 } 289 } 290 }, 291 "claims": [ 292 { 293 "claim": "MaCTG achieves state-of-the-art accuracy of 83.33% on the BCVPP auto-programming benchmark.", 294 "evidence": "Table 1 shows MaCTG achieving 90.00% on simple, 78.00% on medium, and 80.00% on hard tasks, for 83.33% overall. The next best method is DeepSeek-R1 at 73.33%.", 295 "supported": "moderate" 296 }, 297 { 298 "claim": "MaCTG reduces operational costs by 89.09% compared to existing multi-agent frameworks.", 299 "evidence": "Table 2 shows MaCTG costs $0.61 vs. ChatDev GPT-4o at $7.83 (highest multi-agent cost). 
The 89.09% figure is not calculated against the most expensive framework: it equals 1 - 0.61/5.59 (MetaGPT GPT-4o at $5.59), whereas ChatDev GPT-3.5 ($5.81) would give about 89.50%.", 300 "supported": "moderate" 301 }, 302 { 303 "claim": "Context-aware planning adjustment reduces hallucinations and improves accuracy, with 14% drop on medium and 20% drop on hard tasks when removed.", 304 "evidence": "Ablation in Table 1: MaCTG without context-aware planning achieves 76.67/62.00/30.00 vs. full MaCTG at 90.00/78.00/80.00. The 14% and 20% drop figures for medium and hard are not confirmed by the table (78-62=16, not 14 as claimed; 80-30=50, not 20 as claimed). The stated figures do not match the table.", 305 "supported": "weak" 306 }, 307 { 308 "claim": "Collaborative reasoning outperforms independent agent reasoning, with the independent reasoning variant scoring only 44.44% overall.", 309 "evidence": "Table 1 shows MaCTG with independent reasoning achieving 70.00/32.00/30.00 (44.44% overall) vs. full MaCTG at 83.33%. The difference is substantial.", 310 "supported": "moderate" 311 }, 312 { 313 "claim": "MaCTG outperforms both open-source and proprietary methods across various task difficulties.", 314 "evidence": "Table 1 confirms MaCTG (83.33%) outperforms all baselines including DeepSeek-R1 (73.33%), OpenAI-o1 (67.78%), and Claude 3.7 Sonnet (64.44%). However, results are from single runs on a small dataset (90 projects, only 10 hard) with no variance or significance testing.", 315 "supported": "moderate" 316 } 317 ], 318 "methodology_tags": [ 319 "benchmark-eval" 320 ], 321 "key_findings": "MaCTG, a multi-agent framework using a dynamic graph structure for collaborative reasoning, achieved 83.33% execution accuracy on the BCVPP image processing auto-programming benchmark, outperforming both individual LLMs and existing multi-agent frameworks. The hybrid model configuration (DeepSeek-V3 for planning, Qwen2.5-Coder-7B for execution) reduced API costs to $0.61 for the full benchmark evaluation. 
Ablation studies showed that both the context-aware planning adjustment and multi-scale validation components contribute to performance, with the collaborative reasoning approach substantially outperforming independent agent reasoning (83.33% vs 44.44%).", 322 "red_flags": [ 323 { 324 "flag": "Very small benchmark with no variance reporting", 325 "detail": "The BCVPP dataset has only 90 projects (30 simple, 50 medium, 10 hard). With only 10 hard projects, the 80% accuracy for MaCTG represents 8/10 correct. Single-run results with no error bars or multiple trials make it impossible to assess result stability." 326 }, 327 { 328 "flag": "Narrow evaluation domain presented as general", 329 "detail": "The evaluation is exclusively on Python image processing tasks using OpenCV, yet the title and abstract claim broad 'automatic programming' capability and 'real-world applicability'. No evidence supports generalization beyond this narrow domain." 330 }, 331 { 332 "flag": "Self-constructed benchmark evaluated only by authors", 333 "detail": "The BCVPP benchmark was constructed by the authors (with 50 medium projects generated by OpenAI-o1 and curated by the authors). There is no independent validation of benchmark quality or difficulty, and the construction process could introduce biases favorable to MaCTG's design." 334 }, 335 { 336 "flag": "No contamination analysis despite using public code", 337 "detail": "The simple projects come from a public GitHub repository with 340+ stars. All evaluated models may have seen this code during training. No contamination analysis is provided for any model." 338 }, 339 { 340 "flag": "Ablation study numbers inconsistent with text", 341 "detail": "The paper claims removing context-aware planning adjustment reduced accuracy by '14% on medium tasks and 20% on hard tasks', but Table 1 shows drops of 16% (78-62) and 50% (80-30) respectively. This arithmetic discrepancy undermines confidence in the reported analysis." 
342 }, 343 { 344 "flag": "Cost comparison uses selective baseline", 345 "detail": "The 89.09% cost reduction figure equals 1 - 0.61/5.59, i.e. MaCTG ($0.61) against MetaGPT GPT-4o ($5.59); against ChatDev GPT-3.5 ($5.81) the reduction would be about 89.50%. Neither baseline is the most expensive framework (ChatDev GPT-4o at $7.83). Additionally, the local deployment of Qwen2.5-Coder-7B incurs GPU costs that are not included in the $0.61 figure, making the comparison asymmetric." 346 }, 347 { 348 "flag": "No statistical significance tests", 349 "detail": "All comparative claims are based on raw accuracy differences without any statistical testing, despite the small sample size making random variation a serious concern." 350 } 351 ], 352 "cited_papers": [ 353 { 354 "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework", 355 "authors": ["Sirui Hong", "Xiawu Zheng", "Jonathan Chen"], 356 "year": 2023, 357 "arxiv_id": "2308.00352", 358 "relevance": "Major multi-agent framework baseline for LLM-based software development, directly compared in experiments." 359 }, 360 { 361 "title": "ChatDev: Communicative Agents for Software Development", 362 "authors": ["Chen Qian", "Wei Liu", "Hongzhang Liu"], 363 "year": 2024, 364 "relevance": "Multi-agent code generation framework used as baseline comparison in Table 1." 365 }, 366 { 367 "title": "AgentCoder: Multi-Agent-Based Code Generation with Iterative Testing and Optimisation", 368 "authors": ["Dong Huang", "Qingwen Bu", "Jie M Zhang"], 369 "year": 2023, 370 "arxiv_id": "2312.13010", 371 "relevance": "Multi-agent code generation approach with testing agent integration, related work in multi-agent programming." 372 }, 373 { 374 "title": "Self-collaboration code generation via ChatGPT", 375 "authors": ["Yihong Dong", "Xue Jiang", "Zhi Jin", "Ge Li"], 376 "year": 2023, 377 "arxiv_id": "2304.07590", 378 "relevance": "Early multi-agent code generation approach using role-based LLM collaboration." 
379 }, 380 { 381 "title": "CodeAgent: Enhancing Code Generation with Tool-Integrated Agent Systems for Real-World Repo-Level Coding Challenges", 382 "authors": ["Kechi Zhang", "Jia Li", "Ge Li"], 383 "year": 2024, 384 "arxiv_id": "2401.07339", 385 "relevance": "Agent-based code generation system integrating external tools for repository-level tasks." 386 }, 387 { 388 "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models", 389 "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"], 390 "year": 2023, 391 "arxiv_id": "2305.10601", 392 "relevance": "Foundational reasoning strategy for LLMs that MaCTG's graph-based approach builds upon." 393 }, 394 { 395 "title": "Graph of Thoughts: Solving Elaborate Problems with Large Language Models", 396 "authors": ["Maciej Besta", "Nils Blach", "Ales Kubicek"], 397 "year": 2024, 398 "relevance": "Graph-based reasoning approach for LLMs that directly inspired MaCTG's collaborative thought graph structure." 399 }, 400 { 401 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 402 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"], 403 "year": 2022, 404 "arxiv_id": "2210.03629", 405 "relevance": "Reasoning and acting framework for LLM agents, used as inspiration for MaCTG's two-step planning process." 406 }, 407 { 408 "title": "A Survey on Large Language Model Based Autonomous Agents", 409 "authors": ["Lei Wang", "Chen Ma", "Xueyang Feng"], 410 "year": 2024, 411 "relevance": "Comprehensive survey of LLM-based autonomous agents covering agent frameworks and interaction strategies." 412 }, 413 { 414 "title": "Evaluating Large Language Models in Class-Level Code Generation", 415 "authors": ["Xueying Du", "Mingwei Liu", "Kaixin Wang"], 416 "year": 2024, 417 "relevance": "Evaluation of LLMs on project-level code generation tasks, directly related to MaCTG's problem domain." 
418 }, 419 { 420 "title": "A Survey on Large Language Models for Code Generation", 421 "authors": ["Juyong Jiang", "Fan Wang", "Jiasi Shen"], 422 "year": 2024, 423 "arxiv_id": "2406.00515", 424 "relevance": "Survey of LLM code generation capabilities including discussion of model selection and agent-based approaches." 425 }, 426 { 427 "title": "DeepSeek-V3 Technical Report", 428 "authors": ["DeepSeek-AI"], 429 "year": 2024, 430 "arxiv_id": "2412.19437", 431 "relevance": "Technical report for the primary proprietary model used in MaCTG's planner agents." 432 } 433 ] 434 }