scan.json (25933B)
1 { 2 "paper": { 3 "title": "Thought Communication in Multiagent Collaboration", 4 "authors": ["Yujia Zheng", "Zhuokai Zhao", "Zijian Li", "Yaqi Xie", "Mingze Gao", "Lizhu Zhang", "Kun Zhang"], 5 "year": 2025, 6 "venue": "NeurIPS 2025", 7 "arxiv_id": "2510.20733", 8 "doi": "10.48550/arXiv.2510.20733" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "checklist": { 13 "artifacts": { 14 "code_released": { 15 "applies": true, 16 "answer": false, 17 "justification": "No repository URL or code archive is provided in the paper. The paper references the baseline code (Multiagent Finetuning GitHub) but does not release its own THOUGHTCOMM implementation." 18 }, 19 "data_released": { 20 "applies": true, 21 "answer": true, 22 "justification": "The experiments use publicly available benchmarks: MATH and GSM8K. The synthetic data is generated from described random processes." 23 }, 24 "environment_specified": { 25 "applies": true, 26 "answer": false, 27 "justification": "The paper mentions '8 NVIDIA H100 GPUs' (Appendix C.1) but provides no requirements.txt, library versions, or environment setup details." 28 }, 29 "reproduction_instructions": { 30 "applies": true, 31 "answer": false, 32 "justification": "No step-by-step reproduction instructions, README, or scripts are provided." 33 } 34 }, 35 "statistical_methodology": { 36 "confidence_intervals_or_error_bars": { 37 "applies": true, 38 "answer": true, 39 "justification": "Table 1 reports ± standard deviation for all accuracy scores (e.g., '85.00 ± 1.60')." 40 }, 41 "significance_tests": { 42 "applies": true, 43 "answer": false, 44 "justification": "The paper claims THOUGHTCOMM outperforms baselines but provides no statistical significance tests (no p-values, t-tests, or bootstrap tests). Comparisons are based solely on point estimates with standard deviations." 45 }, 46 "effect_sizes_reported": { 47 "applies": true, 48 "answer": true, 49 "justification": "The paper reports absolute gains and relative improvements, e.g., '17.2% absolute gain over Multiagent Finetuning and a 113.3% relative improvement over the single answer baseline' (§5.2)." 50 }, 51 "sample_size_justified": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper uses 500 examples for fine-tuning and 500 for evaluation following prior work, but provides no justification for why these sizes are adequate." 55 }, 56 "variance_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "Standard deviations are reported for accuracy scores in Table 1 (e.g., '± 2.23')." 60 } 61 }, 62 "evaluation_design": { 63 "baselines_included": { 64 "applies": true, 65 "answer": true, 66 "justification": "The paper compares against single-LLM performance and Multiagent Finetuning (Subramaniam et al., 2025), described as 'current state-of-the-art.'" 67 }, 68 "baselines_contemporary": { 69 "applies": true, 70 "answer": true, 71 "justification": "Multiagent Finetuning (2025) is contemporary. The paper acknowledges other multi-agent workflows exist but focuses on validating the paradigm rather than exhaustive comparison." 72 }, 73 "ablation_study": { 74 "applies": true, 75 "answer": true, 76 "justification": "The paper varies prefix length (§5.4), number of debate rounds (§5.3), latent dimensionality (Appendix C.3), and number of agents (Appendix C.4), effectively ablating key design choices." 77 }, 78 "multiple_metrics": { 79 "applies": true, 80 "answer": true, 81 "justification": "The paper reports both accuracy and consensus score (proportion of unanimous decisions) as metrics." 82 }, 83 "human_evaluation": { 84 "applies": true, 85 "answer": false, 86 "justification": "No human evaluation is included. All evaluation is automated (accuracy against ground truths)." 87 }, 88 "held_out_test_set": { 89 "applies": true, 90 "answer": true, 91 "justification": "The paper uses 500 examples for fine-tuning and a separate 500 for evaluation (§5.2)." 92 }, 93 "per_category_breakdown": { 94 "applies": true, 95 "answer": true, 96 "justification": "Results are broken down by base model (5 different LLMs), by benchmark (MATH and GSM8K), and by various hyperparameter settings." 97 }, 98 "failure_cases_discussed": { 99 "applies": true, 100 "answer": false, 101 "justification": "No qualitative error analysis or failure cases are discussed. The paper does not examine where THOUGHTCOMM fails or produces incorrect answers." 102 }, 103 "negative_results_reported": { 104 "applies": true, 105 "answer": true, 106 "justification": "On LLaMA 3-8B-Instruct with GSM8K, THOUGHTCOMM (68.40%) slightly underperforms Multiagent Finetuning (69.20%), which is reported honestly in Table 1." 107 } 108 }, 109 "claims_and_evidence": { 110 "abstract_claims_supported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The abstract claims identifiability guarantees (supported by Theorems 1-3) and collaborative advantages (supported by Table 1 experiments). The claims are generally supported by results in the paper." 114 }, 115 "causal_claims_justified": { 116 "applies": true, 117 "answer": false, 118 "justification": "The paper claims THOUGHTCOMM 'enables' superior collaboration via 'mind-to-mind communication,' but the ablations do not isolate whether gains come from the latent thought extraction, the prefix injection, or the additional training signal. The design does not control for the additional parameters introduced by the autoencoder and adapter." 119 }, 120 "generalization_bounded": { 121 "applies": true, 122 "answer": false, 123 "justification": "The abstract claims the paradigm 'naturally extends beyond LLMs to all modalities' but experiments are limited to math reasoning tasks (MATH, GSM8K) with 5 specific LLMs. The title 'Multiagent Collaboration' is much broader than what is tested." 124 }, 125 "alternative_explanations_discussed": { 126 "applies": true, 127 "answer": false, 128 "justification": "The paper does not discuss whether the improvements could be due to the additional training (autoencoder + adapter) rather than the thought communication paradigm itself. No confound analysis is provided." 129 }, 130 "proxy_outcome_distinction": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper frames results as demonstrating 'collaborative advantages of thought communication' and 'mind-to-mind' communication but actually measures math problem accuracy. No discussion of whether math accuracy is a sufficient proxy for the broad collaboration claims." 134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": true, 139 "answer": true, 140 "justification": "Specific model names with sizes are given: Llama-3-8B-Instruct, Phi-4-mini-instruct, Qwen-3-0.6B, Qwen-3-1.7B, Deepseek-R1-distilled-Llama-8B (§5.2)." 141 }, 142 "prompts_provided": { 143 "applies": true, 144 "answer": false, 145 "justification": "The paper describes using debate among agents but does not provide the actual prompts or system instructions used." 146 }, 147 "hyperparameters_reported": { 148 "applies": true, 149 "answer": false, 150 "justification": "The paper states prefix token count is 1 (Appendix C.1) but does not report temperature, top-p, learning rate for the autoencoder/adapter, or other critical hyperparameters." 151 }, 152 "scaffolding_described": { 153 "applies": true, 154 "answer": true, 155 "justification": "The THOUGHTCOMM framework is described in detail in §4: autoencoder for latent extraction, agreement-based reweighting, and prefix adaptation injection. The architecture is clearly specified." 156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": true, 160 "justification": "The paper describes the data split (500 train/500 test), selection of harder questions (level-3 complexity in MATH), and evaluation protocol (§5.2)." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 7 (Conclusion) discusses limitations: framework requires access to model states (not feasible for closed-source models), and the alternative of context-aware embeddings has not been explored empirically." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": true, 172 "justification": "The paper specifically notes that model states may be inaccessible for closed-source models, which is a concrete limitation specific to this approach." 173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": false, 177 "justification": "The paper does not explicitly state what it does NOT show. It does not bound results to math reasoning or specific model sizes. The framing ('beyond LLMs to all modalities') is expansive without clear boundary-setting." 178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": false, 184 "justification": "No raw experimental data (individual predictions, model states, latent representations) is released." 185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": true, 189 "justification": "Data comes from established benchmarks (MATH, GSM8K) with clear sampling described (500/500 split, level-3+ for MATH)." 190 }, 191 "recruitment_methods_described": { 192 "applies": false, 193 "answer": false, 194 "justification": "No human participants; data source is standard benchmarks." 195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": false, 199 "justification": "The pipeline from raw benchmark data to final evaluation is only partially described. How responses are parsed, how ground-truth matching works, and how the standard deviation is computed are not detailed." 200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": true, 206 "justification": "Acknowledgment section lists NSF Award No. 2229881, NIH Contract R01HL159805, and grants from Quris AI, Florin Court Capital, MBZUAI-WIS Joint Program, and Al Deira Causal Education project." 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "Author affiliations clearly listed: CMU, Meta AI, and MBZUAI." 212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": true, 216 "justification": "Funders (NSF, NIH, academic grants) do not have a direct financial stake in multi-agent communication outcomes." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests statement is included. Authors from Meta AI could have commercial interests in multi-agent LLM systems, but no declaration is made." 222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": true, 227 "answer": false, 228 "justification": "The paper evaluates pre-trained LLMs on MATH and GSM8K benchmarks but does not state the training data cutoff dates for any of the models used." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": true, 232 "answer": false, 233 "justification": "MATH (2021) and GSM8K (2021) are well-known benchmarks that likely appear in training data for models released in 2024-2025. No discussion of potential overlap." 234 }, 235 "benchmark_contamination_addressed": { 236 "applies": true, 237 "answer": false, 238 "justification": "Both MATH and GSM8K were published in 2021, well before the training cutoffs of all evaluated models. No contamination analysis or discussion is provided." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants in this study." 246 }, 247 "irb_or_ethics_approval": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants in this study." 251 }, 252 "demographics_reported": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "attrition_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": true, 281 "answer": false, 282 "justification": "No inference cost, latency, or per-example cost is reported. The paper claims the approach is 'efficient' but provides no quantitative cost data." 283 }, 284 "compute_budget_stated": { 285 "applies": true, 286 "answer": false, 287 "justification": "The paper mentions 8 NVIDIA H100 GPUs but does not state total GPU hours, training time, or compute budget for the autoencoder/adapter training." 288 } 289 }, 290 "experimental_rigor": { 291 "seed_sensitivity_reported": { 292 "applies": true, 293 "answer": false, 294 "justification": "Standard deviations are reported but it is unclear whether they come from multiple seeds or are analytical estimates. No explicit seed sensitivity analysis is described." 295 }, 296 "number_of_runs_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "The paper does not explicitly state how many runs produced the reported results. The ± values in Table 1 appear to be standard errors computed from the binary outcomes rather than multi-run variance." 300 }, 301 "hyperparameter_search_budget": { 302 "applies": true, 303 "answer": false, 304 "justification": "No hyperparameter search budget is reported. The latent dimension sweep (§C.3) and prefix length sweep (§5.4) show sensitivity but do not report total search compute." 305 }, 306 "best_config_selection_justified": { 307 "applies": true, 308 "answer": false, 309 "justification": "The paper reports results for multiple configurations but does not explain how the default configuration (prefix=1, latent dim unspecified for main results) was selected." 310 }, 311 "multiple_comparison_correction": { 312 "applies": true, 313 "answer": false, 314 "justification": "Many comparisons are made across 5 models × 2 benchmarks × multiple settings with no correction for multiple comparisons." 315 }, 316 "self_comparison_bias_addressed": { 317 "applies": true, 318 "answer": false, 319 "justification": "The authors use the baseline's released code but compare their own THOUGHTCOMM implementation against it. No acknowledgment of potential author-evaluation bias." 320 }, 321 "compute_budget_vs_performance": { 322 "applies": true, 323 "answer": false, 324 "justification": "THOUGHTCOMM adds autoencoder + adapter training on top of the base model. Multiagent Finetuning fine-tunes the full LLM. The compute difference is discussed qualitatively but no matched-compute comparison is provided." 325 }, 326 "benchmark_construct_validity": { 327 "applies": true, 328 "answer": false, 329 "justification": "MATH and GSM8K are used to evaluate 'thought communication' and 'collaboration beyond language,' but no discussion of whether math accuracy is a valid measure of these broad claims." 330 }, 331 "scaffold_confound_addressed": { 332 "applies": true, 333 "answer": false, 334 "justification": "THOUGHTCOMM introduces a different communication mechanism (latent prefix injection) vs. the baseline (natural language debate). The comparison conflates the scaffold/communication mechanism with the method's value. No attempt to control for the scaffold difference." 335 } 336 }, 337 "data_leakage": { 338 "temporal_leakage_addressed": { 339 "applies": true, 340 "answer": false, 341 "justification": "MATH and GSM8K (both 2021) predate all evaluated models (2024-2025). No discussion of temporal leakage." 342 }, 343 "feature_leakage_addressed": { 344 "applies": true, 345 "answer": false, 346 "justification": "No discussion of whether the evaluation setup leaks information (e.g., through debate context or problem format)." 347 }, 348 "non_independence_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "The 500/500 train/test split is described as random sampling. No discussion of potential non-independence (e.g., similar problem structures across splits)." 352 }, 353 "leakage_detection_method": { 354 "applies": true, 355 "answer": false, 356 "justification": "No leakage detection or prevention methods are used or discussed." 357 } 358 } 359 }, 360 "claims": [ 361 { 362 "claim": "Latent shared and private thoughts between agents can be identified (disentangled) under sparsity regularization in a nonparametric setting.", 363 "evidence": "Theorems 1, 2, and 3 in §3 provide formal proofs. Synthetic experiments (§5.1, Figs. 3-4) validate recovery with R² scores and MCC above identifiability threshold.", 364 "supported": "strong" 365 }, 366 { 367 "claim": "THOUGHTCOMM achieves 67.23% average relative improvement over single-agent and 19.06% over the state-of-the-art Multiagent Finetuning.", 368 "evidence": "Table 1 in §5.2 shows results across 5 models and 2 benchmarks. Improvements are consistent across most settings.", 369 "supported": "moderate" 370 }, 371 { 372 "claim": "THOUGHTCOMM is robust to increasing debate rounds while Multiagent Finetuning degrades.", 373 "evidence": "Figure 6 (§5.3) shows accuracy drop for baseline but gains for THOUGHTCOMM from 2 to 6 rounds on LLaMA-3-8B-Instruct. Additional results on Qwen-3-1.7B in Appendix C.2.", 374 "supported": "moderate" 375 }, 376 { 377 "claim": "THOUGHTCOMM is robust to prefix length, achieving near-optimal performance with a single injected vector.", 378 "evidence": "Figure 5 (§5.4) shows <5% fluctuation across prefix lengths 1-16 on four models.", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "THOUGHTCOMM's training overhead is model-agnostic, depending only on embedding dimension not parameter count.", 383 "evidence": "Stated in §5.2. Theoretical argument that autoencoder/adapter depend on embedding dimension. No empirical training cost data provided.", 384 "supported": "weak" 385 } 386 ], 387 "methodology_tags": ["theoretical", "benchmark-eval"], 388 "key_findings": "The paper introduces THOUGHTCOMM, a framework for multi-agent LLM communication that bypasses natural language by extracting and sharing latent thoughts via a sparsity-regularized autoencoder. Theoretical identifiability guarantees (Theorems 1-3) show shared and private latent thoughts can be recovered. On MATH and GSM8K benchmarks across 5 LLMs, THOUGHTCOMM outperforms Multiagent Finetuning by an average of 19% relative improvement. The approach shows robustness to varying prefix lengths, debate rounds, and number of agents.", 389 "red_flags": [ 390 { 391 "flag": "Benchmark contamination risk ignored", 392 "detail": "Both MATH (2021) and GSM8K (2021) are well-known benchmarks likely present in the training data of all 5 evaluated models (released 2024-2025). No contamination analysis is performed, which undermines the validity of absolute accuracy numbers. Relative comparisons between methods may still be valid if contamination affects all methods equally." 393 }, 394 { 395 "flag": "Confound between additional training and thought communication", 396 "detail": "THOUGHTCOMM trains an autoencoder and adapter module, providing additional learning signal. Multiagent Finetuning also trains, but via different mechanism. It is unclear whether gains come from the 'thought communication' paradigm or from the specific training procedure. No matched-compute or training-signal ablation is provided." 397 }, 398 { 399 "flag": "Overclaiming from narrow benchmarks", 400 "detail": "The paper claims to enable 'communication beyond language' and a paradigm for 'superhuman intelligence,' but experiments are limited to math problem-solving (MATH, GSM8K). These are well-structured tasks with clear answers — the claims far outrun the evidence." 401 }, 402 { 403 "flag": "No significance testing", 404 "detail": "Claims of 'outperforming' baselines are made across many comparisons without statistical tests. Some differences are within overlapping standard deviation ranges (e.g., LLaMA-3-8B GSM8K: 68.40 ± 2.08 vs 69.20 ± 2.06)." 405 }, 406 { 407 "flag": "Standard deviations may not represent multi-run variance", 408 "detail": "The ± values in Table 1 appear to be standard errors computed from the binomial distribution (binary correct/incorrect on 500 samples) rather than variance across multiple experimental runs. The paper does not clarify this, making it unclear whether the results are reproducible across runs." 409 } 410 ], 411 "cited_papers": [ 412 { 413 "title": "Improving factuality and reasoning in language models through multiagent debate", 414 "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba", "Joshua B Tenenbaum", "Igor Mordatch"], 415 "year": 2023, 416 "relevance": "Foundational multi-agent debate paper that THOUGHTCOMM builds upon and compares against." 417 }, 418 { 419 "title": "Why do multi-agent LLM systems fail?", 420 "authors": ["Mert Cemri", "Melissa Z Pan", "Shuyi Yang"], 421 "year": 2025, 422 "arxiv_id": "2503.13657", 423 "relevance": "Empirical analysis of multi-agent failure modes including vague message specification and inter-agent misalignment." 424 }, 425 { 426 "title": "Multiagent finetuning: Self improvement with diverse reasoning chains", 427 "authors": ["Vighnesh Subramaniam", "Yilun Du", "Joshua B Tenenbaum", "Antonio Torralba", "Shuang Li", "Igor Mordatch"], 428 "year": 2025, 429 "arxiv_id": "2501.05707", 430 "relevance": "Primary baseline — current state-of-the-art in multi-agent collaboration through finetuning." 431 }, 432 { 433 "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversation", 434 "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"], 435 "year": 2023, 436 "arxiv_id": "2308.08155", 437 "relevance": "Major multi-agent LLM framework relevant to agentic AI survey scope." 438 }, 439 { 440 "title": "MetaGPT: Meta programming for multi-agent collaborative framework", 441 "authors": ["Sirui Hong", "Xiawu Zheng", "Jonathan Chen"], 442 "year": 2023, 443 "arxiv_id": "2308.00352", 444 "relevance": "Multi-agent collaboration framework with role-based agent design." 445 }, 446 { 447 "title": "CAMEL: Communicative agents for mind exploration of large language model society", 448 "authors": ["Guohao Li", "Hasan Hammoud", "Hani Itani", "Dmitrii Khizbullin", "Bernard Ghanem"], 449 "year": 2023, 450 "relevance": "Multi-agent communication framework exploring role-playing agent collaboration." 451 }, 452 { 453 "title": "Large language model based multi-agents: A survey of progress and challenges", 454 "authors": ["Taicheng Guo", "Xiuying Chen", "Yaqi Wang"], 455 "year": 2024, 456 "arxiv_id": "2402.01680", 457 "relevance": "Survey of LLM multi-agent systems covering communication, coordination, and challenges." 458 }, 459 { 460 "title": "DSPy: Compiling declarative language model calls into self-improving pipelines", 461 "authors": ["Omar Khattab", "Arnav Singhvi", "Paridhi Maheshwari"], 462 "year": 2023, 463 "arxiv_id": "2310.03714", 464 "relevance": "LLM pipeline framework relevant to multi-agent workflow design." 465 }, 466 { 467 "title": "Encouraging divergent thinking in large language models through multi-agent debate", 468 "authors": ["Tian Liang", "Zhiwei He", "Wenxiang Jiao"], 469 "year": 2023, 470 "arxiv_id": "2305.19118", 471 "relevance": "Multi-agent debate approach for improving LLM reasoning through diverse exchanges." 472 }, 473 { 474 "title": "The llama 3 herd of models", 475 "authors": ["Aaron Grattafiori", "Abhimanyu Dubey"], 476 "year": 2024, 477 "arxiv_id": "2407.21783", 478 "relevance": "Major open-source LLM family used as base model in experiments." 479 }, 480 { 481 "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning", 482 "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"], 483 "year": 2025, 484 "arxiv_id": "2501.12948", 485 "relevance": "Reasoning-focused LLM used as base model in experiments." 486 }, 487 { 488 "title": "Collab: Controlled decoding using mixture of agents for LLM alignment", 489 "authors": ["Souradip Chakraborty", "Sujay Bhatt"], 490 "year": 2025, 491 "arxiv_id": "2503.21720", 492 "relevance": "Token-level multi-agent collaboration approach, alternative to debate-style communication." 493 } 494 ] 495 }