scan.json (24159B)
1 { 2 "paper": { 3 "title": "AGENTSNET: Coordination and Collaborative Reasoning in Multi-Agent LLMs", 4 "authors": [ 5 "Florian Grötschla", 6 "Luis Müller", 7 "Jan Tönshoff", 8 "Mikhail Galkin", 9 "Bryan Perozzi" 10 ], 11 "year": 2025, 12 "venue": "Preprint", 13 "arxiv_id": "2507.08616" 14 }, 15 "checklist": { 16 "artifacts": { 17 "code_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The paper provides a GitHub repository at https://github.com/floriangroetschla/AgentsNet (Section 5.1, Implementation). Open-source code is explicitly stated." 21 }, 22 "data_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "The dataset is released on HuggingFace at https://huggingface.co/datasets/disco-eth/AgentsNet (Section 5.1, Implementation)." 26 }, 27 "environment_specified": { 28 "applies": true, 29 "answer": false, 30 "justification": "The paper mentions using LangChain and NetworkX as implementation frameworks but does not provide a requirements.txt, Dockerfile, or detailed version specifications for all dependencies." 31 }, 32 "reproduction_instructions": { 33 "applies": true, 34 "answer": false, 35 "justification": "No step-by-step reproduction instructions are included in the paper itself. The code is available on GitHub but no 'Reproducing Results' section or README instructions are described in the paper." 36 } 37 }, 38 "statistical_methodology": { 39 "confidence_intervals_or_error_bars": { 40 "applies": true, 41 "answer": true, 42 "justification": "Table 2 reports standard error of the mean in parentheses for all results (e.g., '0.14 (0.04)'). 
Figure 1 explicitly notes 'Error bars indicate standard error of the mean.'" 43 }, 44 "significance_tests": { 45 "applies": true, 46 "answer": false, 47 "justification": "The paper makes comparative claims ('best performing models are Claude 3.7 Sonnet, Gemini 2.5 Pro, and Gemini 2.5 Flash'; 'Gemini 2.5 Flash is roughly on par with Claude 3.7') but uses no significance tests (p-values, t-tests, bootstrap, etc.) to support these comparisons." 48 }, 49 "effect_sizes_reported": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper reports raw benchmark scores but never explicitly computes or frames relative differences as effect sizes. Comparisons like 'roughly on par' and 'cheaper by a factor of 20' are made informally without formal effect size measures." 53 }, 54 "sample_size_justified": { 55 "applies": true, 56 "answer": false, 57 "justification": "27 network topologies with at least one repeat per graph are used (Section 5.1). There is no power analysis or explicit justification for why 27 graphs and the number of repeats are sufficient for the precision of results." 58 }, 59 "variance_reported": { 60 "applies": true, 61 "answer": true, 62 "justification": "Table 2 reports standard error of the mean for all task-model combinations (values in parentheses). The paper references Miller [32] as the statistical methodology basis." 63 } 64 }, 65 "evaluation_design": { 66 "baselines_included": { 67 "applies": true, 68 "answer": true, 69 "justification": "Multiple frontier LLM models are compared against each other on the benchmark, including Claude, GPT, Gemini, Llama 4, and o4-mini variants (Table 2). These serve as mutual baselines." 70 }, 71 "baselines_contemporary": { 72 "applies": true, 73 "answer": true, 74 "justification": "All evaluated models are 2024-2025 frontier models (Claude 3.5 Haiku, Claude 3.7 Sonnet, GPT-4.1 mini, Gemini 2.5 Pro, Llama 4, o4-mini), representing the state of the art at time of publication." 
75 }, 76 "ablation_study": { 77 "applies": true, 78 "answer": false, 79 "justification": "No ablation study is reported to analyze the contribution of individual components (e.g., number of message-passing rounds, chain-of-thought prompting). The paper evaluates different models on the benchmark but does not ablate benchmark design choices." 80 }, 81 "multiple_metrics": { 82 "applies": true, 83 "answer": true, 84 "justification": "The paper uses binary (fully correct) evaluation as the main metric and also discusses and reports soft evaluation scores in Appendix B, providing a continuous measure of solution quality." 85 }, 86 "human_evaluation": { 87 "applies": false, 88 "answer": false, 89 "justification": "Human evaluation of system outputs is clearly irrelevant to this benchmark paper, which evaluates LLM agents on structured mathematical problems with objective ground truth." 90 }, 91 "held_out_test_set": { 92 "applies": true, 93 "answer": true, 94 "justification": "Graphs are generated procedurally from distributions; no model tuning is performed on them. The evaluation set is generated independently and used only for testing, so there is no development/test confound." 95 }, 96 "per_category_breakdown": { 97 "applies": true, 98 "answer": true, 99 "justification": "Figure 4 provides a per-task and per-graph-size breakdown for each model, showing performance on CONSENSUS, MATCHING, LEADER ELECTION, COLORING, and VERTEX COVER separately at 4, 8, and 16 nodes." 100 }, 101 "failure_cases_discussed": { 102 "applies": true, 103 "answer": true, 104 "justification": "Section 5.4 provides qualitative analysis with three findings about failure modes: strategy coordination failures, agents accepting erroneous information, and conflict resolution. Appendix E provides additional transcript examples." 
105 }, 106 "negative_results_reported": { 107 "applies": true, 108 "answer": true, 109 "justification": "The paper explicitly shows that performance drops severely on 100-agent networks (near zero across all tasks in Figure 5, which reports Gemini 2.0 Flash), and many models perform poorly on VERTEXCOVER and COLORING tasks. These are negative results reported candidly." 110 } 111 }, 112 "claims_and_evidence": { 113 "abstract_claims_supported": { 114 "applies": true, 115 "answer": true, 116 "justification": "Abstract claims are supported: 'some frontier LLMs demonstrate strong performance for small networks but fall off as network scales' is borne out in Table 2 and Figures 4-5. The claim about scaling to 100 agents is demonstrated in Figure 5." 117 }, 118 "causal_claims_justified": { 119 "applies": false, 120 "answer": false, 121 "justification": "The paper makes no causal claims — it is a benchmark paper that reports performance scores. No language like 'improves', 'causes', or 'leads to' is used to draw causal inferences from the benchmark results." 122 }, 123 "generalization_bounded": { 124 "applies": true, 125 "answer": true, 126 "justification": "Claims are appropriately scoped to the specific models and graph distributions tested. Results are presented as benchmark performance on specific tasks, not generalized to all multi-agent systems or all coordination problems." 127 }, 128 "alternative_explanations_discussed": { 129 "applies": true, 130 "answer": false, 131 "justification": "The qualitative analysis in Section 5.4 describes failure patterns but does not systematically discuss alternative explanations for why certain models fail. For example, whether failures stem from context length limits, prompt formatting issues, or fundamental reasoning gaps is not distinguished."
132 } 133 }, 134 "setup_transparency": { 135 "model_versions_specified": { 136 "applies": true, 137 "answer": true, 138 "justification": "Table 3 in Appendix A explicitly lists API providers and exact model versions for all evaluated models (e.g., claude-3-5-haiku-20241022, claude-3-7-sonnet-20250219, gpt-4.1-mini, gemini-2.5-flash-preview-04-17)." 139 }, 140 "prompts_provided": { 141 "applies": true, 142 "answer": true, 143 "justification": "The full system prompt template is provided in Appendix A (Section 'Full System Prompt'), including all template variables. Task-specific prompt texts ([task1] and [task2]) for each task are provided in Appendix B, and examples are given in Section 4." 144 }, 145 "hyperparameters_reported": { 146 "applies": true, 147 "answer": false, 148 "justification": "The paper does not report LLM API hyperparameters such as temperature or top-p settings used during evaluation. These are omitted from both the main paper and appendices." 149 }, 150 "scaffolding_described": { 151 "applies": true, 152 "answer": true, 153 "justification": "Section 4 and Appendix A describe the message-passing protocol in detail, including the synchronous round structure, JSON formatting requirements, retry logic for invalid JSON, and Algorithm 1 pseudocode. This constitutes a thorough scaffolding description." 154 }, 155 "data_preprocessing_documented": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section 5.1 describes graph generation: 27 topologies, i.e., 3 graphs for each combination of graph size (4, 8, 16 nodes) and distribution type (small-world, scale-free, Delaunay). Appendix D details the three graph models." 159 } 160 }, 161 "limitations_and_scope": { 162 "limitations_section_present": { 163 "applies": true, 164 "answer": true, 165 "justification": "Section 6 is titled 'Limitations' and discusses the LOCAL model's synchronous message-passing constraint and JSON parsing issues. Appendix G provides further elaboration."
166 }, 167 "threats_to_validity_specific": { 168 "applies": true, 169 "answer": true, 170 "justification": "Appendix G discusses specific threats: (1) synchronous communication model limits ecological validity for real-world transfer, (2) binary metric may obscure partial progress, (3) homogeneous agents miss heterogeneous deployment challenges, (4) no adversarial/faulty agents tested. These are specific to this study's design choices." 171 }, 172 "scope_boundaries_stated": { 173 "applies": true, 174 "answer": false, 175 "justification": "While the paper is appropriately scoped to its benchmark tasks, it does not explicitly state what the results do NOT show (e.g., that performance on AGENTSNET does not predict performance on real-world multi-agent tasks)." 176 } 177 }, 178 "data_integrity": { 179 "raw_data_available": { 180 "applies": true, 181 "answer": true, 182 "justification": "The dataset is released on HuggingFace at https://huggingface.co/datasets/disco-eth/AgentsNet, allowing independent verification of the benchmark instances." 183 }, 184 "data_collection_described": { 185 "applies": true, 186 "answer": true, 187 "justification": "Graph generation is procedurally described in Section 5.1 and Appendix D, including the three graph models (small-world, scale-free, Delaunay) and size parameters. The generation process is reproducible from the code." 188 }, 189 "recruitment_methods_described": { 190 "applies": false, 191 "answer": false, 192 "justification": "No human participants are involved; the study uses procedurally generated graphs and LLM API calls. Recruitment is not applicable." 193 }, 194 "data_pipeline_documented": { 195 "applies": true, 196 "answer": true, 197 "justification": "The full pipeline from graph generation to model evaluation is described: graph sampling, agent instantiation, message-passing rounds, response extraction, and scoring. Algorithm 1 provides pseudocode." 
198 } 199 }, 200 "conflicts_of_interest": { 201 "funding_disclosed": { 202 "applies": true, 203 "answer": false, 204 "justification": "No acknowledgments or funding disclosure section is present in the paper. Two authors are affiliated with Google Research, which produces Gemini models evaluated in the benchmark." 205 }, 206 "affiliations_disclosed": { 207 "applies": true, 208 "answer": true, 209 "justification": "Author affiliations are listed on the title page: Florian Grötschla (ETH Zurich), Luis Müller and Jan Tönshoff (RWTH Aachen), Mikhail Galkin and Bryan Perozzi (Google Research). The Google affiliation is visible." 210 }, 211 "funder_independent_of_outcome": { 212 "applies": true, 213 "answer": false, 214 "justification": "Two of five authors are from Google Research, which develops the Gemini models evaluated in the benchmark. This is a potential non-independent relationship between the researchers and the outcomes, and the paper does not acknowledge the conflict." 215 }, 216 "financial_interests_declared": { 217 "applies": true, 218 "answer": false, 219 "justification": "There is no competing interests statement in the paper. The Google Research affiliation of two authors and their potential interest in Gemini model performance is not explicitly declared." 220 } 221 }, 222 "contamination": { 223 "training_cutoff_stated": { 224 "applies": true, 225 "answer": false, 226 "justification": "The paper evaluates LLM models on the benchmark but does not state the training data cutoff dates for any of the models evaluated." 227 }, 228 "train_test_overlap_discussed": { 229 "applies": true, 230 "answer": false, 231 "justification": "No discussion of whether the benchmark tasks or graph structures could appear in training data. However, because tasks are procedurally generated, contamination risk is likely low — but this is not discussed."
232 }, 233 "benchmark_contamination_addressed": { 234 "applies": true, 235 "answer": false, 236 "justification": "The benchmark is novel and the paper claims it was created in 2025, but no analysis is provided of whether similar distributed computing problems or graph structures exist in training data. The paper does not address this." 237 } 238 }, 239 "human_studies": { 240 "pre_registered": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants are involved in this benchmark evaluation paper." 244 }, 245 "irb_or_ethics_approval": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants are involved in this benchmark evaluation paper." 249 }, 250 "demographics_reported": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants are involved in this benchmark evaluation paper." 254 }, 255 "inclusion_exclusion_criteria": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants are involved in this benchmark evaluation paper." 259 }, 260 "randomization_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants are involved in this benchmark evaluation paper." 264 }, 265 "blinding_described": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants are involved in this benchmark evaluation paper." 269 }, 270 "attrition_reported": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants are involved in this benchmark evaluation paper." 274 } 275 }, 276 "cost_and_practicality": { 277 "inference_cost_reported": { 278 "applies": true, 279 "answer": true, 280 "justification": "Figure 1 plots model performance versus API cost per repeat in USD (as of May 15, 2025), showing costs ranging from ~$1 to ~$200 per repeat. Pareto-optimal cost-performance models are identified." 
281 }, 282 "compute_budget_stated": { 283 "applies": true, 284 "answer": false, 285 "justification": "While per-repeat API costs are shown in Figure 1, the total computational budget for the full benchmark (total number of API calls, total cost) is not explicitly stated." 286 } 287 } 288 }, 289 "claims": [ 290 { 291 "claim": "Existing multi-agent benchmarks cover at most 2-5 agents, while AGENTSNET scales to practically unlimited network sizes, with experiments involving up to 100 coordinating agents.", 292 "evidence": "Abstract and Section 1 state this; Figure 5 shows Gemini 2.0 Flash performance on networks of 20-100 agents.", 293 "supported": "strong" 294 }, 295 { 296 "claim": "Some frontier LLMs demonstrate strong performance for small networks (4 nodes) but performance degrades as network size increases.", 297 "evidence": "Table 2 and Figure 4 show per-task performance broken down by graph size (4, 8, 16 nodes). Figure 5 shows near-zero performance at 100 nodes for all tasks.", 298 "supported": "strong" 299 }, 300 { 301 "claim": "Claude 3.7 Sonnet, Gemini 2.5 Pro, and Gemini 2.5 Flash are the best performing models overall on AGENTSNET.", 302 "evidence": "Table 2 shows AGENTSNET aggregate scores: Claude 3.7 Sonnet (0.70), Gemini 2.5 Pro (0.80), Gemini 2.5 Flash (0.69), all higher than other models.", 303 "supported": "strong" 304 }, 305 { 306 "claim": "Gemini 2.5 Flash achieves roughly comparable performance to Claude 3.7 Sonnet at approximately 1/20th the API cost.", 307 "evidence": "Figure 1 plots cost vs. performance, showing Gemini 2.5 Flash as Pareto-optimal relative to Claude 3.7 Sonnet. AGENTSNET scores are 0.69 vs. 0.70 respectively.", 308 "supported": "moderate" 309 }, 310 { 311 "claim": "Strategy coordination is an essential challenge: agents sometimes fail to agree on strategies or agree too late in the message-passing rounds.", 312 "evidence": "Section 5.4, Finding 1 describes this pattern with transcript analysis. 
Appendix E contains full qualitative examples.", 313 "supported": "moderate" 314 } 315 ], 316 "methodology_tags": [ 317 "benchmark-eval" 318 ], 319 "key_findings": "AGENTSNET is a new multi-agent benchmark built on five distributed computing problems (graph coloring, vertex cover, maximal matching, leader election, consensus) that evaluates LLM agents' ability to coordinate through synchronous message-passing on graph topologies. Frontier models perform reasonably on small networks (4-16 nodes) but performance collapses near zero for 100-agent networks across all tasks. Among evaluated models, Gemini 2.5 Pro achieves the highest overall score (0.80), while Gemini 2.5 Flash offers the best cost-performance tradeoff. Qualitative analysis reveals key failure modes: poor strategy coordination, acceptance of erroneous neighbor information, and difficulty maintaining global consistency through purely local communication.", 320 "red_flags": [ 321 { 322 "flag": "Conflict of interest: Google Research authors evaluate Gemini models", 323 "detail": "Two of five authors (Mikhail Galkin and Bryan Perozzi) are affiliated with Google Research, which develops the Gemini family of models. Gemini models (2.0 Flash, 2.5 Flash, 2.5 Flash Thinking, 2.5 Pro) constitute 4 of the 10 evaluated models. No competing interests statement is present, and no funding is disclosed." 324 }, 325 { 326 "flag": "No hyperparameters reported", 327 "detail": "Temperature and other sampling hyperparameters for LLM API calls are not reported anywhere in the paper. These significantly affect output variability for multi-agent coordination tasks." 328 }, 329 { 330 "flag": "Limited number of repeats per graph", 331 "detail": "The paper uses 'at least one repeat per graph' for the main results (Table 2), with some cells showing as few as 5-6 samples total. 
The standard errors for some model-task combinations are large (e.g., 0.09 for Gemini 2.5 Flash on VERTEXCOVER), suggesting high variance with insufficient repetitions." 332 }, 333 { 334 "flag": "No ablation of message-passing rounds", 335 "detail": "The number of message-passing rounds is set heuristically (2D+1 for global tasks, size-based for local tasks). No ablation studies examine whether different numbers of rounds significantly affect results, making it unclear whether performance differences between models reflect coordination ability or sensitivity to round count." 336 } 337 ], 338 "cited_papers": [ 339 { 340 "title": "Agentverse: Facilitating multi-agent collaboration and exploring emergent behaviors", 341 "authors": [ 342 "Chen, W.", 343 "Su, Y.", 344 "Zuo, J." 345 ], 346 "year": 2024, 347 "relevance": "Multi-agent LLM collaboration framework directly relevant to the survey scope on agentic AI systems." 348 }, 349 { 350 "title": "GPTSwarm: Language agents as optimizable graphs", 351 "authors": [ 352 "Zhuge, M.", 353 "Wang, W.", 354 "Kirsch, L.", 355 "Faccio, F.", 356 "Khizbullin, D.", 357 "Schmidhuber, J." 358 ], 359 "year": 2024, 360 "relevance": "Graph-based approach to organizing LLM agents that motivates the AGENTSNET benchmark design." 361 }, 362 { 363 "title": "AgentBench: Evaluating LLMs as agents", 364 "authors": [ 365 "Liu, X.", 366 "Yu, H.", 367 "Zhang, H." 368 ], 369 "year": 2024, 370 "relevance": "A major multi-agent benchmark that AGENTSNET is explicitly positioned as complementary to." 371 }, 372 { 373 "title": "LLM-Coordination: Evaluating and analyzing multi-agent coordination abilities in large language models", 374 "authors": [ 375 "Agashe, S.", 376 "Fan, Y.", 377 "Reyna, A.", 378 "Wang, X. E." 379 ], 380 "year": 2024, 381 "arxiv_id": "2310.03903", 382 "relevance": "Directly evaluates multi-agent coordination in LLMs, a core topic for this survey." 
383 }, 384 { 385 "title": "Improving factuality and reasoning in language models through multiagent debate", 386 "authors": [ 387 "Du, Y.", 388 "Li, S.", 389 "Torralba, A.", 390 "Tenenbaum, J. B.", 391 "Mordatch, I." 392 ], 393 "year": 2023, 394 "relevance": "Seminal work on multi-agent debate as a method to improve LLM reasoning quality." 395 }, 396 { 397 "title": "Scaling large-language-model-based multi-agent collaboration", 398 "authors": [ 399 "Qian, C.", 400 "Xie, Z.", 401 "Wang, Y." 402 ], 403 "year": 2024, 404 "arxiv_id": "2406.07155", 405 "relevance": "Studies scaling properties of multi-agent LLM collaboration, directly relevant to AGENTSNET's scalability focus." 406 }, 407 { 408 "title": "Are emergent abilities of large language models a mirage?", 409 "authors": [ 410 "Schaeffer, R.", 411 "Miranda, B.", 412 "Koyejo, S." 413 ], 414 "year": 2023, 415 "relevance": "Methodological paper on emergent behaviors in LLMs; cited for its relevance to discontinuous evaluation metrics used in AGENTSNET." 416 }, 417 { 418 "title": "Adding error bars to evals: A statistical approach to language model evaluations", 419 "authors": [ 420 "Miller, E." 421 ], 422 "year": 2024, 423 "arxiv_id": "2411.00640", 424 "relevance": "Provides statistical methodology for LLM evaluation used directly in this paper's analysis." 425 }, 426 { 427 "title": "Generative agents: Interactive simulacra of human behavior", 428 "authors": [ 429 "Park, J. S.", 430 "O'Brien, J.", 431 "Cai, C. J.", 432 "Morris, M. R.", 433 "Liang, P.", 434 "Bernstein, M. S." 435 ], 436 "year": 2023, 437 "relevance": "Foundational paper on LLM-based agent systems that motivates multi-agent research." 438 }, 439 { 440 "title": "Talk like a graph: Encoding graphs for large language models", 441 "authors": [ 442 "Fatemi, B.", 443 "Halcrow, J.", 444 "Perozzi, B." 445 ], 446 "year": 2024, 447 "relevance": "Studies graph reasoning in LLMs, directly relevant to AGENTSNET's graph-based evaluation tasks." 
448 }, 449 { 450 "title": "Tau-bench: A benchmark for tool-agent-user interaction in real-world domains", 451 "authors": [ 452 "Yao, S.", 453 "Shinn, N.", 454 "Razavi, P.", 455 "Narasimhan, K." 456 ], 457 "year": 2024, 458 "arxiv_id": "2406.12045", 459 "relevance": "Another agentic benchmark that AGENTSNET is positioned alongside." 460 } 461 ] 462 }