scan.json (31021B)
1 { 2 "paper": { 3 "title": "OSC: Cognitive Orchestration through Dynamic Knowledge Alignment in Multi-Agent LLM Collaboration", 4 "authors": [ 5 "Jusheng Zhang", 6 "Yijia Fan", 7 "Kaitong Cai", 8 "Xiaofei Sun", 9 "Keze Wang" 10 ], 11 "year": 2025, 12 "venue": "Conference on Empirical Methods in Natural Language Processing", 13 "arxiv_id": "2509.04876", 14 "doi": "10.48550/arXiv.2509.04876" 15 }, 16 "scan_version": 3, 17 "active_modules": ["experimental_rigor", "data_leakage"], 18 "methodology_tags": ["benchmark-eval"], 19 "key_findings": "OSC proposes a multi-agent LLM collaboration framework using Collaborator Knowledge Models (CKM), cognitive gap analysis, and RL-optimized communication policies. On AlpacaEval 2.0, OSC achieves 81.4% LC Win Rate versus KABB (77.9%) and MoA (68.1%), and scores 9.94 on MT-Bench. Ablation shows CKM and the communication policy are critical components (removing CKM drops LC Win Rate to 71.2%, and removing the communication policy drops it to 69.4%). Optimal performance occurs at 6 agents, with degradation at larger team sizes due to coordination overhead.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "No code repository URL, GitHub link, or archive is provided anywhere in the paper or appendix." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The paper uses publicly available benchmarks: AlpacaEval 2.0 (805 instructions) and MT-Bench. These are standard public datasets." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "While the paper mentions using Together Inference Endpoint and a single NVIDIA A100 80GB GPU, there is no requirements.txt, Dockerfile, or detailed environment specification sufficient to recreate the setup."
36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions, README, or scripts for replicating experiments are provided." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "All results in Tables 1-6 and 9 report point estimates only. The scalability experiment mentions averaging over 3 runs but no confidence intervals or error bars are reported." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper makes numerous comparative claims (e.g., 'OSC outperforms KABB') based solely on comparing point estimates. No statistical significance tests (p-values, t-tests, etc.) are used anywhere." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "Raw performance numbers with baseline context are provided throughout (e.g., OSC 81.4% vs KABB 77.9% vs MoA 68.1% LC Win Rate in Table 1), allowing readers to assess magnitude of differences." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "No justification is given for why the 805 AlpacaEval instructions or the 160-instruction development and validation subsets were chosen. No power analysis is provided." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "Section 4.4 states results were 'averaged over 3 independent runs to ensure robustness' but no standard deviations, IQR, or any spread measures are reported in any table." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Table 1 compares against KABB, MoA, multiple GPT-4 variants, individual LLMs (Qwen2, Gemma-2, WizardLM-2, LLaMa-3, DeepSeek-V3, DeepSeek-R1). Table 2 compares against TalkHier, REMALIS, DyLAN, and MAC."
75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "Baselines include DeepSeek-R1 (2025), DeepSeek-V3 (2024), GPT-4o (2024), and KABB (2025, ICML), representing current state of the art." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "Table 3 ablates CKM, fgap, πcomm, and rshape. Table 6 provides fine-grained ablation of CKM feature dimensions, fupdate mechanism, action components, prompt simplification, and fgap alternatives. Table 9 ablates reward components." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "The paper reports LC Win Rate, standard Win Rate, MT-Bench average/per-turn scores, average communication rounds, token count, redundancy percentage, conflict resolution rate, and information density." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": true, 94 "justification": "Section 4.6 includes human evaluation by 3 reviewers who assessed dialogue clarity, relevance, and collaborativeness on a 1-5 scale across 3 case studies. However, this is extremely limited in scope." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "Section 4.4 states '160 instructions reserved for development and validation respectively.' AlpacaEval 2.0 is used as a standardized test set separate from the development data." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "MT-Bench results are broken down by 1st and 2nd turn. Communication metrics provide per-dimension breakdown (rounds, tokens, redundancy, conflict resolution, info density). Scalability results are broken down by agent count." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": false, 109 "justification": "No concrete failure examples or error analysis is provided. 
The limitations section mentions general degradation with larger teams but does not show specific failure cases or qualitative examples of where OSC produces poor outputs." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The scalability experiment (Table 4) shows performance degradation from 81.4% at 6 agents to 77.5% at 10 agents. The ablation study shows which components hurt when removed. Section 4.4 reports increased CKM update latency and memory issues with 10 agents." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": false, 121 "justification": "The contributions section claims 'OSC outperforms baselines on complex reasoning benchmarks (MATH)' but no MATH evaluation results are reported. MATH is mentioned only as a training environment (Section 7.6). This is a material mismatch between stated contributions and reported results." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "The ablation study (Table 3) uses controlled single-variable manipulation, removing one component at a time while holding others constant. This is an adequate design for the causal claims about component contributions (e.g., 'CKM enables improved collaboration')." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The title claims 'Multi-Agent LLM Collaboration' broadly, and the abstract claims results on 'complex reasoning and problem-solving benchmarks,' but evaluation is limited to AlpacaEval 2.0 (instruction following) and MT-Bench (multi-turn dialogue). No coding, mathematical reasoning, or domain-specific benchmarks are tested." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "No alternative explanations for the results are discussed. 
For instance, the paper does not consider whether improvements could be due to the additional compute/rounds rather than the CKM mechanism, or whether the GPT-4 judge has biases that favor OSC's output style." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper measures LC Win Rate (GPT-4 as judge) and frames it as 'task performance' and 'cognitive synergy' without discussing that GPT-4 judgments are a proxy with known biases (length bias, style preferences). The gap between LLM-as-judge scores and actual quality is not acknowledged." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": true, 148 "justification": "Specific model identifiers are provided: Qwen2-72B-Instruct, LLaMa-3-70B-Instruct, WizardLM-2-8x22B, Gemma-2-27B, Deepseek-V3, Deepseek-R1. GPT-4 variants include dates (e.g., 'GPT-4 Omni (05/13)')." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": false, 153 "justification": "Appendix 8.3 describes the prompt structure conceptually (Role and Context, CKM-derived Assessment, Strategic Insights, etc.) but does not provide actual prompt text. Only the schema of what prompts contain is given, not the verbatim prompts." 154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Tables 7 and 8 provide comprehensive hyperparameter settings including PPO parameters (learning rate 1e-4, γ=0.99, ε=0.2), policy network architecture (4 layers, 4 heads, d=256), CKM architecture (2 layers, 2 heads, d=128), reward shaping values, and fLLM generation parameters (temperature 0.7, top-p 0.9)." 
159 }, 160 "scaffolding_described": { 161 "applies": true, 162 "answer": true, 163 "justification": "The OSC framework's scaffolding is described in extensive detail: CKM initialization and updates (Section 3.2), cognitive gap analysis (Section 3.3), communication policy (Section 3.4), linguistic realization (Section 3.4.1), round-robin speaking order, and the full pipeline from expert selection through aggregation." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": false, 168 "justification": "The paper mentions splitting AlpacaEval 2.0 into '160 instructions reserved for development and validation respectively' but does not describe how these splits were made, what preprocessing was applied to the instructions, or how the training data for CKM pretraining was processed." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": true, 175 "justification": "A dedicated 'Limitations' section discusses five specific limitations: scalability with increasing agent numbers, cognitive state modeling complexity, reliance on shaped rewards, hyperparameter sensitivity, and computational cost growth." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": true, 180 "justification": "The limitations are specific to this study: 'with 10 agents, there was an observed increase in CKM update latency and memory consumption per inference step,' 'conflict resolution dropping to 87.8%,' and 'agents sometimes misjudged collaborators' cognitive states.' These are concrete, study-specific issues." 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": false, 185 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to the tested benchmarks (AlpacaEval 2.0, MT-Bench) or acknowledge that results may not transfer to coding, domain-specific, or multilingual tasks." 
186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": false, 192 "justification": "No raw data (model outputs, agent dialogue logs, CKM states) is released for independent verification. Only aggregated metrics in tables are provided." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "The benchmarks used are standard and well-described: AlpacaEval 2.0 (805 instructions) and MT-Bench. Inference was conducted via the Together Inference Endpoint. Training setup uses PPO for 5×10^6 steps." 198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "The primary experiments use standard benchmarks (AlpacaEval 2.0, MT-Bench) with no human participants. The 3 human reviewers in the qualitative analysis are incidental and do not constitute a human subjects study." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": false, 207 "justification": "The pipeline from query input through CKM initialization, communication rounds, to aggregation is described conceptually, but specific data processing steps (how CKM pretraining data was prepared, how benchmark splits were created, filtering steps) are not documented with counts or criteria." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding source or acknowledgments section is present in the paper. With university and industry (Alibaba) affiliations, funding likely exists but is not disclosed." 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Author affiliations are clearly listed: Sun Yat-sen University (Zhang, Fan, Cai, Wang) and Alibaba Group (Xiaofei Sun)." 
220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "One author (Xiaofei Sun) is from Alibaba Group, and Qwen2-72B-Instruct (an Alibaba model) serves as the aggregator in all experiments. The choice of Alibaba's model as the critical aggregator component while an Alibaba employee is co-author represents a non-independent relationship that is not acknowledged." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests statement or financial disclosure is present in the paper." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": true, 235 "answer": false, 236 "justification": "No training data cutoff dates are stated for any of the six models used (Qwen2-72B, LLaMa-3-70B, WizardLM-2, Gemma-2-27B, DeepSeek-V3, DeepSeek-R1)." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": true, 240 "answer": false, 241 "justification": "AlpacaEval 2.0 and MT-Bench are well-known public benchmarks that could be in the training data of the models used. No discussion of potential overlap." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": true, 245 "answer": false, 246 "justification": "AlpacaEval 2.0 and MT-Bench have been publicly available since 2023, and several models used (DeepSeek-R1, DeepSeek-V3) were trained after their release. No contamination analysis is performed." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "The paper has no human subjects study. The 3 reviewers in the qualitative analysis are evaluators, not participants in a study." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants study; this is a benchmark evaluation paper." 
259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants study." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants study." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants study." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants study." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants study." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": true, 290 "justification": "Section 4.5 provides cost per instruction ($0.97 for OSC N=6 vs $0.91 for KABB) on AlpacaEval 2.0. Figure 2 visualizes the price-performance frontier. Section 11 reports inference latency of 1.79s per instruction." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": true, 295 "justification": "Section 11 reports 10.8 GPU hours for training and 11.3 GB training memory on a single NVIDIA A100 80GB GPU. Training uses 5×10^6 environment timesteps (main experiments) or 1×10^6 (efficiency experiments)." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "The scalability experiment mentions 'averaged over 3 independent runs' but no seed sensitivity analysis or variance across seeds is reported for any experiment, including the main results in Table 1." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": false, 307 "justification": "Only the scalability experiment (Section 4.4) and hyperparameter tuning (Section 9) state '3 independent runs.' 
The main results (Table 1), communication efficiency (Table 2), and ablation (Table 3) do not state how many runs produced them." 308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": true, 312 "justification": "Section 9 describes a grid search over Nround × λcost (4×3 = 12 configurations, each run 3 times = 36 total runs) on the AlpacaEval 2.0 development set." 313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": true, 317 "justification": "Section 9 states configuration selection was done on the development set (160 instructions), selecting 'the configuration with the highest LC win rate and reasonable rounds and token count.' Selection was on dev, not test." 318 }, 319 "multiple_comparison_correction": { 320 "applies": true, 321 "answer": false, 322 "justification": "The paper makes numerous comparisons across multiple models, configurations, and ablation variants without any statistical tests, let alone corrections for multiple comparisons." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The authors implement and evaluate their own OSC system against baselines including KABB (from the same first author group — Zhang et al., 2025d). No acknowledgment of self-comparison bias or independent evaluation." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": true, 332 "justification": "Section 4.5 and Figure 2 explicitly analyze price-performance tradeoffs, showing performance as a function of cost per instruction across different numbers of active agents (N=1 to N=6)." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": false, 337 "justification": "AlpacaEval 2.0 uses GPT-4 as judge, which has documented biases (length bias, style preferences). The paper does not discuss whether GPT-4 judgments actually measure the 'cognitive synergy' or 'deep collaboration' being claimed." 
338 }, 339 "scaffold_confound_addressed": { 340 "applies": false, 341 "answer": false, 342 "justification": "OSC is itself the scaffolding framework being tested. Comparisons are against other scaffolding approaches (KABB, MoA) using the same model pool, so the scaffold IS the thing being evaluated." 343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": false, 349 "justification": "AlpacaEval 2.0 was published in 2023. Several models used (DeepSeek-R1, DeepSeek-V3, 2024-2025) were trained after its release and may have seen its instructions. This is not discussed." 350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "No discussion of whether the evaluation setup leaks information. The fact that GPT-4 variants appear as compared baselines while GPT-4 also serves as the AlpacaEval judge is not discussed." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "No discussion of whether the 805 AlpacaEval instructions or MT-Bench questions are independent of training data or of each other." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": false, 364 "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference tests, or decontamination pipelines are mentioned." 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "OSC achieves the highest LC win rate on AlpacaEval 2.0 at 81.4%, outperforming KABB (77.9%) and MoA (68.1%).", 371 "evidence": "Table 1, Section 4.1. Comparison across multiple models and frameworks. Point estimates without error bars or statistical tests.", 372 "supported": "moderate" 373 }, 374 { 375 "claim": "OSC sets a new state-of-the-art on MT-Bench with an average score of 9.94, outperforming KABB (9.65) and MoA (9.41).", 376 "evidence": "Table 1, Section 4.1. MT-Bench scores reported for first turn (9.96) and second turn (9.73).
No variance reported.", 377 "supported": "moderate" 378 }, 379 { 380 "claim": "OSC surpasses SOTA multi-agent frameworks in communication efficiency, completing tasks in 4.6 rounds and 3.31k tokens compared to TalkHier (4.9, 3.52k), REMALIS (5.2, 3.78k), DyLAN (5.5, 3.95k), and MAC (5.7, 4.15k).", 381 "evidence": "Table 2, Section 4.2. Communication metrics reported. No variance or statistical tests.", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "Removing CKM drops LC Win Rate from 81.4% to 71.2%, and removing πcomm drops it to 69.4%, demonstrating these are critical components.", 386 "evidence": "Table 3, Section 4.3. Ablation with 5×10^6 PPO timesteps. No variance reported across runs.", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "Optimal performance is achieved with 6 agents (81.4% LC Win Rate), with degradation at 8 agents (80.2%) and 10 agents (77.5%).", 391 "evidence": "Table 4, Section 4.4. Averaged over 3 independent runs, but no standard deviations reported.", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "Fine-tuning CKM and fgap end-to-end improves LC Win Rate from 76.8% (pretrain-only) to 81.4% and reduces average rounds from 5.1 to 4.3.", 396 "evidence": "Section 5, Figure 3. Comparison of pretrain-only vs pretrain+fine-tune configurations.", 397 "supported": "moderate" 398 }, 399 { 400 "claim": "OSC offers comparable or better LC Win Rates than proprietary models like GPT-4o and Claude-3.7 at lower costs.", 401 "evidence": "Section 4.5, Figure 2. Cost per instruction $0.97 for OSC N=6. Proprietary model costs from OpenRouter API.", 402 "supported": "weak" 403 } 404 ], 405 "red_flags": [ 406 { 407 "flag": "MATH benchmark claim not substantiated", 408 "detail": "The contributions section specifically claims 'OSC outperforms baselines on complex reasoning benchmarks (MATH)' but no MATH evaluation results are reported anywhere in the paper. MATH is only mentioned as a training environment in Appendix 7.6. 
This is a material mismatch between stated contributions and reported evidence." 409 }, 410 { 411 "flag": "Suspiciously high MT-Bench scores", 412 "detail": "OSC achieves 9.94 average on MT-Bench (near-perfect on a 10-point scale), substantially exceeding GPT-4 variants (8.84-9.31) and even DeepSeek-R1 (9.30). A 6-model ensemble scoring this high warrants skepticism, especially without variance reporting or ablation specific to MT-Bench." 413 }, 414 { 415 "flag": "No error bars despite claiming multiple runs", 416 "detail": "The scalability experiment claims 'averaged over 3 independent runs' but reports no standard deviations or variance measures. The main results (Table 1) do not state how many runs were conducted. Without variance, the claimed differences (e.g., 81.4% vs 77.9%) cannot be assessed for statistical significance." 417 }, 418 { 419 "flag": "Undisclosed Alibaba conflict of interest", 420 "detail": "Co-author Xiaofei Sun is from Alibaba Group, and Alibaba's Qwen2-72B-Instruct serves as the aggregator in all experiments. The aggregator is a critical component that processes all agent outputs into the final answer. This conflict is not acknowledged." 421 }, 422 { 423 "flag": "Self-citation baseline (KABB)", 424 "detail": "The primary baseline KABB is from the same first-author group (Zhang et al., 2025d, ICML). Self-comparison bias is well-documented — authors' implementations of their own systems tend to be more carefully tuned than their implementations of baselines." 425 }, 426 { 427 "flag": "No contamination analysis on well-known benchmarks", 428 "detail": "AlpacaEval 2.0 and MT-Bench are widely known benchmarks available since 2023. Several models used (DeepSeek-R1, DeepSeek-V3) were trained after their release. No training cutoff dates or contamination analysis is provided." 
429 }, 430 { 431 "flag": "Communication efficiency metrics are self-defined", 432 "detail": "Metrics like 'Communication Redundancy,' 'Conflict Resolution Rate,' and 'Task-Relevant Information Density' in Table 2 are not standardized metrics. Their definitions and measurement procedures are not provided in the paper, making comparison across frameworks unverifiable." 433 } 434 ], 435 "cited_papers": [ 436 { 437 "title": "Large language model based multi-agents: A survey of progress and challenges", 438 "authors": ["Taicheng Guo", "Xiuying Chen", "Yaqi Wang"], 439 "year": 2024, 440 "relevance": "Survey of multi-agent LLM systems covering progress and challenges in the field." 441 }, 442 { 443 "title": "Improving factuality and reasoning in language models through multiagent debate", 444 "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba"], 445 "year": 2023, 446 "arxiv_id": "2305.14325", 447 "relevance": "Multi-agent debate framework for improving LLM factuality and reasoning, a key related approach to inter-agent communication." 448 }, 449 { 450 "title": "Mixture-of-agents enhances large language model capabilities", 451 "authors": ["Junlin Wang", "Jue Wang", "Ben Athiwaratkun"], 452 "year": 2024, 453 "arxiv_id": "2406.04692", 454 "relevance": "Mixture-of-Agents (MoA) framework, one of the main baselines in this paper." 455 }, 456 { 457 "title": "MetaGPT: Meta programming for a multi-agent collaborative framework", 458 "authors": ["Sirui Hong", "Mingchen Zhuge", "Jonathan Chen"], 459 "year": 2024, 460 "relevance": "Structured multi-agent collaborative framework assigning roles to LLM agents for software development." 461 }, 462 { 463 "title": "CAMEL: Communicative agents for 'mind' exploration of large language model society", 464 "authors": ["Guohao Li", "Hasan Abed Al Kader Hammoud"], 465 "year": 2023, 466 "relevance": "Framework for studying communicative agent interactions in LLM multi-agent settings." 
467 }, 468 { 469 "title": "Exploring collaboration mechanisms for LLM agents: A social psychology view", 470 "authors": ["Jintian Zhang", "Xin Xu", "Ningyu Zhang"], 471 "year": 2024, 472 "arxiv_id": "2310.02124", 473 "relevance": "Studies collaboration mechanisms for LLM agents from social psychology perspective." 474 }, 475 { 476 "title": "KABB: Knowledge-aware bayesian bandits for dynamic expert coordination in multi-agent systems", 477 "authors": ["Jusheng Zhang", "Zimeng Huang", "Yijia Fan"], 478 "year": 2025, 479 "relevance": "Primary baseline; knowledge-aware routing framework for multi-agent expert coordination, from same author group." 480 }, 481 { 482 "title": "Debating with more persuasive LLMs leads to more truthful answers", 483 "authors": ["Akbir Khan", "John Hughes", "Dan Valentine"], 484 "year": 2024, 485 "relevance": "Studies how inter-LLM debate dynamics affect answer quality, directly relevant to multi-agent communication." 486 }, 487 { 488 "title": "Proximal policy optimization algorithms", 489 "authors": ["John Schulman", "Filip Wolski", "Prafulla Dhariwal"], 490 "year": 2017, 491 "arxiv_id": "1707.06347", 492 "relevance": "PPO algorithm used as the core RL optimization method for OSC's communication policy." 493 }, 494 { 495 "title": "Are more LLM calls all you need? Towards scaling laws of compound inference systems", 496 "authors": ["Lingjiao Chen", "Jared Quincy Davis", "Boris Hanin"], 497 "year": 2024, 498 "arxiv_id": "2403.02419", 499 "relevance": "Investigates scaling behavior of compound LLM systems, relevant to multi-agent collaboration efficiency." 500 }, 501 { 502 "title": "Multi-agent software development through cross-team collaboration", 503 "authors": ["Zhuoyun Du", "Chen Qian", "Wei Liu"], 504 "year": 2024, 505 "arxiv_id": "2406.08979", 506 "relevance": "Multi-agent system for software development through cross-team collaboration." 
507 }, 508 { 509 "title": "Rethinking the bounds of LLM reasoning: Are multi-agent discussions the key?", 510 "authors": ["Qineng Wang", "Zihao Wang", "Ying Su"], 511 "year": 2024, 512 "relevance": "Investigates whether multi-agent discussions improve LLM reasoning capabilities." 513 } 514 ], 515 "engagement_factors": { 516 "practical_relevance": { 517 "score": 1, 518 "justification": "Proposes a multi-agent collaboration framework but no code release — practitioners cannot use it." 519 }, 520 "surprise_contrarian": { 521 "score": 1, 522 "justification": "Incremental improvement on multi-agent LLM collaboration; does not challenge conventional wisdom." 523 }, 524 "fear_safety": { 525 "score": 0, 526 "justification": "No safety or risk concerns raised; purely about performance optimization." 527 }, 528 "drama_conflict": { 529 "score": 0, 530 "justification": "No controversial claims or challenges to existing work." 531 }, 532 "demo_ability": { 533 "score": 0, 534 "justification": "No code, demo, or installable tool is available." 535 }, 536 "brand_recognition": { 537 "score": 1, 538 "justification": "Sun Yat-sen University and Alibaba Group have some recognition but are not top-tier AI lab brands." 539 } 540 } 541 }