scan.json (24252B)
1 { 2 "paper": { 3 "title": "BAMAS: Structuring Budget-Aware Multi-Agent Systems", 4 "authors": [ 5 "Liming Yang", 6 "Junyu Luo", 7 "Xuanzhe Liu", 8 "Yiling Lou", 9 "Zhenpeng Chen" 10 ], 11 "year": 2025, 12 "venue": "AAAI 2026", 13 "arxiv_id": "2511.21572", 14 "doi": null 15 }, 16 "checklist": { 17 "artifacts": { 18 "code_released": { 19 "applies": true, 20 "answer": true, 21 "justification": "The paper states 'We publicly release our code and data at https://github.com/chunfenri/BAMAS, to support further research' in the contributions section." 22 }, 23 "data_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The benchmarks used (GSM8K, MBPP, MATH) are all publicly available standard benchmarks. The paper also states data is released at their GitHub repository." 27 }, 28 "environment_specified": { 29 "applies": true, 30 "answer": false, 31 "justification": "The paper mentions using PuLP library with CBC solver, all-MiniLM-L6-v2 for embeddings, Adam optimizer, and specific LLM providers, but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions." 32 }, 33 "reproduction_instructions": { 34 "applies": true, 35 "answer": false, 36 "justification": "The paper provides algorithm pseudocode and hyperparameter tables but does not include step-by-step reproduction instructions or README-level commands to replicate the main experiments. The GitHub link is provided but the paper itself does not contain reproduction instructions." 37 } 38 }, 39 "statistical_methodology": { 40 "confidence_intervals_or_error_bars": { 41 "applies": true, 42 "answer": false, 43 "justification": "All results in Tables 1, 2, and 4 are reported as single point estimates (e.g., '95.3%' accuracy, '542.9' cost) with no confidence intervals or error bars." 
44 }, 45 "significance_tests": { 46 "applies": true, 47 "answer": false, 48 "justification": "The paper makes comparative claims like 'BAMAS achieves comparable performance while reducing cost by up to 86%' but does not report any statistical significance tests (no p-values, t-tests, or bootstrap tests)." 49 }, 50 "effect_sizes_reported": { 51 "applies": true, 52 "answer": true, 53 "justification": "The paper reports percentage improvements with baseline context, e.g., '62% cost reduction' (542.9 vs 1,425.3), '86% reduction' (529.2 vs 3,735.1), and accuracy comparisons like '95.3% vs. 95.4%'. These provide sufficient context for the reader to assess effect magnitudes." 54 }, 55 "sample_size_justified": { 56 "applies": true, 57 "answer": false, 58 "justification": "The paper uses standard benchmark sizes (GSM8K: 1,319 test, MBPP: 500 test, MATH: 1,000 test) but never justifies why these sample sizes are adequate for the claims being made. For MATH, a stratified sample of 1,000 is used but the justification is 'to manage its size,' not statistical adequacy." 59 }, 60 "variance_reported": { 61 "applies": true, 62 "answer": false, 63 "justification": "The paper mentions using greedy decoding (temperature 0.0) for determinism and setting random seeds, but reports only single-run results. No standard deviation, variance, or spread across multiple runs is reported. The RL training uses a single seed per dataset." 64 } 65 }, 66 "evaluation_design": { 67 "baselines_included": { 68 "applies": true, 69 "answer": true, 70 "justification": "The paper compares against three state-of-the-art multi-agent construction approaches: AutoGen, MetaGPT, and ChatDev. Additionally, a Naive-CostAware heuristic baseline is introduced for ablation." 71 }, 72 "baselines_contemporary": { 73 "applies": true, 74 "answer": true, 75 "justification": "AutoGen (2024), MetaGPT (2024), and ChatDev (2024) are all recent and widely cited multi-agent frameworks, representing the current state of the art." 
76 }, 77 "ablation_study": { 78 "applies": true, 79 "answer": true, 80 "justification": "RQ2 presents a component analysis comparing BAMAS against Naive-CostAware, which ablates both the ILP-based LLM provisioning and the RL-based topology selection components. Table 4 shows this comparison." 81 }, 82 "multiple_metrics": { 83 "applies": true, 84 "answer": true, 85 "justification": "The paper reports two metrics: Accuracy (Acc %) for task performance and Average Cost (Avg Cost) for cost efficiency. Out-of-budget task counts are also reported in Table 3." 86 }, 87 "human_evaluation": { 88 "applies": false, 89 "answer": false, 90 "justification": "The paper evaluates on automated benchmarks (GSM8K, MBPP, MATH) where correctness is objectively verifiable. Human evaluation is not relevant for these tasks." 91 }, 92 "held_out_test_set": { 93 "applies": true, 94 "answer": true, 95 "justification": "For all three datasets, the paper clearly separates training data (used for RL training) from test data. GSM8K: first 1,000 training examples for RL, full 1,319 test set for evaluation. MBPP: 374 training problems for RL, 500 test problems. MATH: 1,000 stratified training, 1,000 stratified test." 96 }, 97 "per_category_breakdown": { 98 "applies": true, 99 "answer": true, 100 "justification": "The paper provides per-dataset breakdowns across three different benchmarks (Tables 1, 2), per-budget-level breakdowns, per-topology distributions across datasets and budgets (Figure 3), and out-of-budget counts per dataset/budget (Table 3)." 101 }, 102 "failure_cases_discussed": { 103 "applies": true, 104 "answer": true, 105 "justification": "The paper discusses out-of-budget failures in Table 3 and notes that the Planner-Driven topology is never selected because 'its high cost and instability (since a poor plan can derail the entire process) make it a suboptimal choice.' The risk-averse behavior under low budgets is also discussed." 
106 }, 107 "negative_results_reported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The paper reports that the Planner-Driven topology is never selected despite offering the most flexibility. It also shows that at the lowest budget (500), BAMAS achieves lower accuracy than baselines (87.9% vs 95.4%), transparently showing the cost-accuracy trade-off." 111 } 112 }, 113 "claims_and_evidence": { 114 "abstract_claims_supported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The abstract claims 'comparable performance while reducing cost by up to 86%.' This is supported by Table 1 on MBPP where BAMAS at budget 1,250 achieves 82.6% accuracy (comparable to MetaGPT's 82.2%) at cost 529.2 vs 3,735.1, an 86% reduction." 118 }, 119 "causal_claims_justified": { 120 "applies": true, 121 "answer": true, 122 "justification": "The paper's causal claims are primarily from ablation studies (RQ2). The comparison between BAMAS and Naive-CostAware, which removes the ILP provisioning and RL topology selection, constitutes a controlled comparison showing the joint contribution of these components; because both are removed together, it does not isolate the effect of each component individually." 123 }, 124 "generalization_bounded": { 125 "applies": true, 126 "answer": false, 127 "justification": "The title claims 'Structuring Budget-Aware Multi-Agent Systems' broadly, but results are tested on only three benchmarks (math word problems and basic Python programming) with only two LLMs (DeepSeek-V3 and GPT-4.1 nano). The paper does not bound its claims to these specific task types or LLM pairs. The conclusion states BAMAS 'achieves a state-of-the-art cost-performance trade-off' without qualification." 128 }, 129 "alternative_explanations_discussed": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper does not discuss alternative explanations for the observed results. 
For example, it does not consider whether the cost savings are primarily due to using cheaper models (GPT-4.1 nano) rather than the ILP/RL optimization itself, or whether the specific choice of only two LLMs constrains the generality of findings." 133 } 134 }, 135 "setup_transparency": { 136 "model_versions_specified": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper specifies 'DeepSeek-V3' and 'GPT-4.1 nano' with API names 'deepseek-chat' and 'gpt-4.1-nano' in Appendix B, but does not provide snapshot dates or specific API versions. These model behaviors can change across versions." 140 }, 141 "prompts_provided": { 142 "applies": true, 143 "answer": false, 144 "justification": "The paper describes agent roles (Planner, Executor, Critic) and topology types but does not provide the actual prompts or system instructions used for the LLM agents. The prompts are described only conceptually." 145 }, 146 "hyperparameters_reported": { 147 "applies": true, 148 "answer": true, 149 "justification": "Appendix B provides detailed hyperparameters: learning rates (0.0003 for GSM8K, 0.0015 for MATH/MBPP), Adam optimizer, batch size 20,000, 10 training epochs, entropy coefficient 0.001, temperature 0.0, max generation tokens per role per dataset (Table 7), and random seeds." 150 }, 151 "scaffolding_described": { 152 "applies": true, 153 "answer": true, 154 "justification": "The paper describes four collaboration topologies (Linear, Star, Feedback, Planner-Driven) in detail, including role assignments, information flow, and coordination mechanisms. The generate-critique-revise loop for Feedback topology and the planner's dynamic orchestration are explained." 
155 }, 156 "data_preprocessing_documented": { 157 "applies": true, 158 "answer": true, 159 "justification": "The paper describes data preprocessing: GSM8K uses first 1,000 training examples; MBPP uses all 374 training problems; MATH uses stratified sampling by difficulty level and problem type to construct 1,000-problem training and test sets. Cost estimation uses Tin=500 tokens with Tout determined by sampling 50 training instances." 160 } 161 }, 162 "limitations_and_scope": { 163 "limitations_section_present": { 164 "applies": true, 165 "answer": false, 166 "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The conclusion does not discuss any limitations either." 167 }, 168 "threats_to_validity_specific": { 169 "applies": true, 170 "answer": false, 171 "justification": "No threats to validity are discussed anywhere in the paper. There is no consideration of potential weaknesses in the experimental design." 172 }, 173 "scope_boundaries_stated": { 174 "applies": true, 175 "answer": false, 176 "justification": "The paper does not explicitly state what the results do NOT show. There is no discussion of which task types, LLM configurations, or deployment scenarios are outside the scope of the current evaluation." 177 } 178 }, 179 "data_integrity": { 180 "raw_data_available": { 181 "applies": true, 182 "answer": false, 183 "justification": "The paper states code and data are released at GitHub, but the paper itself does not describe making the raw experimental logs, per-example results, or RL training trajectories available for independent verification." 184 }, 185 "data_collection_described": { 186 "applies": true, 187 "answer": true, 188 "justification": "The paper describes how benchmarks were obtained: GSM8K and MBPP use official training/test splits; MATH uses stratified sampling. RL training data collection is described via the topology selection process with offline experiences." 
189 }, 190 "recruitment_methods_described": { 191 "applies": false, 192 "answer": false, 193 "justification": "No human participants are involved. The data comes from standard public benchmarks (GSM8K, MBPP, MATH)." 194 }, 195 "data_pipeline_documented": { 196 "applies": true, 197 "answer": true, 198 "justification": "The pipeline is documented: (1) benchmark selection and splitting, (2) cost estimation via token sampling, (3) ILP-based LLM provisioning, (4) offline RL training data collection with topology execution, (5) policy training, (6) evaluation on held-out test sets. The flow is clear from methodology through evaluation." 199 } 200 }, 201 "conflicts_of_interest": { 202 "funding_disclosed": { 203 "applies": true, 204 "answer": true, 205 "justification": "The Acknowledgement section states: 'This work was supported by the National Natural Science Foundation of China under the grant number 62325201.'" 206 }, 207 "affiliations_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "Author affiliations are clearly listed: Peking University, UIUC, Nanyang Technological University, and Tsinghua University. None of the authors are affiliated with DeepSeek or OpenAI, the LLM providers evaluated." 211 }, 212 "funder_independent_of_outcome": { 213 "applies": true, 214 "answer": true, 215 "justification": "The funder is the National Natural Science Foundation of China, a government research funding agency with no commercial stake in the evaluated LLMs or the multi-agent framework." 216 }, 217 "financial_interests_declared": { 218 "applies": true, 219 "answer": false, 220 "justification": "There is no competing interests or financial interests statement in the paper." 
221 } 222 }, 223 "contamination": { 224 "training_cutoff_stated": { 225 "applies": true, 226 "answer": false, 227 "justification": "The paper uses DeepSeek-V3 and GPT-4.1 nano to solve benchmark problems (GSM8K, MBPP, MATH), but does not state the training data cutoff dates for either model." 228 }, 229 "train_test_overlap_discussed": { 230 "applies": true, 231 "answer": false, 232 "justification": "GSM8K (2021), MBPP (2021), and MATH (2021) are all public benchmarks that predate the models used. The paper does not discuss whether these benchmarks may have been in the training data of DeepSeek-V3 or GPT-4.1 nano." 233 }, 234 "benchmark_contamination_addressed": { 235 "applies": true, 236 "answer": false, 237 "justification": "All three benchmarks (GSM8K, MBPP, MATH) were published in 2021 and are widely known. Both DeepSeek-V3 and GPT-4.1 nano were likely trained on data containing these benchmarks. The paper does not address this contamination risk." 238 } 239 }, 240 "human_studies": { 241 "pre_registered": { 242 "applies": false, 243 "answer": false, 244 "justification": "No human participants are involved in this study." 245 }, 246 "irb_or_ethics_approval": { 247 "applies": false, 248 "answer": false, 249 "justification": "No human participants are involved in this study." 250 }, 251 "demographics_reported": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants are involved in this study." 255 }, 256 "inclusion_exclusion_criteria": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants are involved in this study." 260 }, 261 "randomization_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants are involved in this study." 265 }, 266 "blinding_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants are involved in this study." 
270 }, 271 "attrition_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants are involved in this study." 275 } 276 }, 277 "cost_and_practicality": { 278 "inference_cost_reported": { 279 "applies": true, 280 "answer": true, 281 "justification": "Cost reporting is central to the paper. Average costs are reported per task across all experiments (e.g., BAMAS at budget 1,625 has average cost 542.9 on GSM8K). LLM pricing is provided (DeepSeek-V3: $0.27/$1.10 per million tokens in/out; GPT-4.1 nano: $0.10/$0.40)." 282 }, 283 "compute_budget_stated": { 284 "applies": true, 285 "answer": false, 286 "justification": "While per-task LLM API costs are reported, the total computational budget for training the RL topology selection policy, running all experiments across budget levels and baselines, and the total API spend are not stated." 287 } 288 } 289 }, 290 "claims": [ 291 { 292 "claim": "BAMAS achieves comparable performance while reducing cost by up to 86% compared to existing multi-agent construction approaches.", 293 "evidence": "Table 1 shows BAMAS at budget 1,250 achieves 82.6% on MBPP vs MetaGPT's 82.2% at cost 529.2 vs 3,735.1, an 86% reduction. On GSM8K, BAMAS at budget 1,625 achieves 95.3% vs AutoGen's 95.4% at cost 542.9 vs 1,425.3 (62% reduction).", 294 "supported": "strong" 295 }, 296 { 297 "claim": "BAMAS's joint optimization of LLM provisioning and topology selection significantly outperforms greedy baselines.", 298 "evidence": "Table 4 (RQ2) shows BAMAS achieves peak 95.3% accuracy on GSM8K at cost 542.9 vs Naive-CostAware's peak 95.3% at cost 1,650.8. 
On MBPP, BAMAS achieves higher peak accuracy (82.6% vs 81.6%) at lower cost (529.2 vs 1,379.1).", 299 "supported": "moderate" 300 }, 301 { 302 "claim": "BAMAS learns to adaptively select collaboration topologies based on task type and budget constraints.", 303 "evidence": "Figure 3 (RQ3) shows Feedback topology dominates for math tasks (40.1% GSM8K, 69.8% MATH) while Linear dominates for code tasks (MBPP). Under tight budgets, simpler topologies (Linear, Star) are preferred. Planner-Driven is never selected.", 304 "supported": "moderate" 305 }, 306 { 307 "claim": "BAMAS rarely exceeds the budget constraint during execution.", 308 "evidence": "Table 3 shows 0 out-of-budget tasks on GSM8K across all budgets, 1-5 on MBPP, and at most 30/1,000 (3%) on MATH at budget 2,000.", 309 "supported": "strong" 310 } 311 ], 312 "methodology_tags": [ 313 "benchmark-eval" 314 ], 315 "key_findings": "BAMAS combines Integer Linear Programming for budget-constrained LLM selection with reinforcement learning for collaboration topology selection to build cost-aware multi-agent systems. On three benchmarks (GSM8K, MBPP, MATH), BAMAS matches or exceeds state-of-the-art accuracy while reducing costs by up to 86% compared to AutoGen, MetaGPT, and ChatDev. The RL-trained topology selector learns task-specific preferences (Feedback for math, Linear for code) and exhibits risk-averse behavior under tight budgets by favoring simpler topologies.", 316 "red_flags": [ 317 { 318 "flag": "No limitations section", 319 "detail": "The paper has no limitations, threats-to-validity, or scope-boundaries discussion whatsoever, which is concerning for a paper making broad claims about multi-agent system construction." 320 }, 321 { 322 "flag": "Only two LLMs tested", 323 "detail": "BAMAS is evaluated with only two LLMs (DeepSeek-V3 and GPT-4.1 nano), yet the framework claims to solve the general problem of budget-aware multi-agent system structuring. 
It is unclear whether the ILP formulation and topology selection would behave similarly with a larger, more diverse LLM pool." 324 }, 325 { 326 "flag": "Benchmark contamination risk", 327 "detail": "All three benchmarks (GSM8K, MBPP, MATH) are from 2021 and widely known. The models used (DeepSeek-V3, GPT-4.1 nano) were likely trained on data containing these benchmarks, but the paper does not acknowledge or address this risk." 328 }, 329 { 330 "flag": "Single-run results with no variance", 331 "detail": "Although the paper uses greedy decoding (temperature 0.0) and fixed random seeds, RL training is run with a single seed per dataset and hosted LLM API outputs may still vary between calls; results are reported from single runs without error bars, confidence intervals, or variance across seeds." 332 }, 333 { 334 "flag": "No prompts provided", 335 "detail": "The actual prompts given to LLM agents (Planner, Executor, Critic roles) are not provided, making it impossible to reproduce the exact system behavior even with the released code." 336 } 337 ], 338 "cited_papers": [ 339 { 340 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 341 "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang", "Yiran Wu", "Shaokun Zhang", "Erkang Zhu", "Beibin Li", "Li Jiang", "Xiaoyun Zhang", "Chi Wang"], 342 "year": 2024, 343 "relevance": "Major multi-agent framework used as baseline; directly relevant to agentic AI system evaluation." 344 }, 345 { 346 "title": "MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework", 347 "authors": ["Sirui Hong"], 348 "year": 2024, 349 "relevance": "Multi-agent system framework using meta-programming paradigm, used as baseline comparison." 350 }, 351 { 352 "title": "ChatDev: Communicative Agents for Software Development", 353 "authors": ["Chen Qian"], 354 "year": 2024, 355 "relevance": "Multi-agent software development system simulating a virtual company, used as baseline." 
356 }, 357 { 358 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 359 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 360 "year": 2024, 361 "relevance": "Directly addresses cost-efficient LLM usage, a core concern of the survey." 362 }, 363 { 364 "title": "Self-Refine: Iterative Refinement with Self-Feedback", 365 "authors": ["Aman Madaan"], 366 "year": 2023, 367 "relevance": "Foundational work on iterative self-refinement in LLM agents, relevant to agentic workflow evaluation." 368 }, 369 { 370 "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", 371 "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath", "Karthik Narasimhan", "Shunyu Yao"], 372 "year": 2023, 373 "relevance": "Key paper on LLM agent learning through verbal feedback, relevant to agent capability evaluation." 374 }, 375 { 376 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 377 "authors": ["Shunyu Yao"], 378 "year": 2023, 379 "relevance": "Seminal work on combining reasoning and action in LLM agents, foundational to agentic AI." 380 }, 381 { 382 "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models", 383 "authors": ["Shunyu Yao"], 384 "year": 2023, 385 "relevance": "Important prompting/reasoning framework for LLMs, relevant to agent capability evaluation." 386 }, 387 { 388 "title": "Large Language Model-Based Agents for Software Engineering: A Survey", 389 "authors": ["Junjielong Liu", "Kaixin Wang", "Yanjie Chen", "Xinrun Peng", "Zhenpeng Chen", "Lingming Zhang", "Yiling Lou"], 390 "year": 2025, 391 "relevance": "Survey of LLM agents for SE, directly in scope for the agentic AI research landscape." 
392 }, 393 { 394 "title": "A Survey on LLM-Based Multi-Agent System: Recent Advances and New Frontiers in Application", 395 "authors": ["Shuaihang Chen"], 396 "year": 2025, 397 "relevance": "Comprehensive survey of LLM multi-agent systems, relevant to understanding the broader field." 398 }, 399 { 400 "title": "Rethinking the Bounds of LLM Reasoning: Are Multi-Agent Discussions the Key?", 401 "authors": ["Qineng Wang"], 402 "year": 2024, 403 "relevance": "Evaluates whether multi-agent discussions improve LLM reasoning, relevant to agent capability claims." 404 }, 405 { 406 "title": "A Unified Approach to Routing and Cascading for LLMs", 407 "authors": ["Jasper Dekoninck", "Maximilian Baader", "Martin Vechev"], 408 "year": 2025, 409 "relevance": "Addresses cost-efficient LLM routing, closely related to budget-aware LLM selection." 410 }, 411 { 412 "title": "Efficient Contextual LLM Cascades through Budget-Constrained Policy Learning", 413 "authors": ["Xiao Zhang"], 414 "year": 2024, 415 "relevance": "Budget-constrained policy learning for LLM cascades, directly relevant to cost-efficient LLM deployment." 416 } 417 ] 418 }