scan.json (25638B)
1 { 2 "paper": { 3 "title": "Collab: Controlled Decoding using Mixture of Agents for LLM Alignment", 4 "authors": [ 5 "Souradip Chakraborty", 6 "Sujay Bhatt", 7 "Udari Madhushani Sehwag", 8 "Alec Koppel", 9 "Soumya Suvra Ghosal", 10 "Jiahao Qiu", 11 "Mengdi Wang", 12 "Dinesh Manocha", 13 "Furong Huang", 14 "Sumitra Ganesh" 15 ], 16 "year": 2025, 17 "venue": "ICLR 2025", 18 "arxiv_id": "2503.21720" 19 }, 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "No repository URL or code archive link is provided anywhere in the paper. The paper mentions 'Reproducibility is ensured through the use of publicly available resources' (Section 5), but this refers to the datasets and models used, not their own code release." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The paper uses publicly available datasets: Berkeley Nectar (Zhu et al., 2023) and HH-RLHF (Bai et al., 2022), as detailed in Table 3. All models used are open-source (Zephyr-7B, Starling-7B, etc.)." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "Appendix A states 'We run all experiments with Python 3.7.4 and PyTorch 1.9.0. For all experimentation, we use two Nvidia RTX A6000 GPUs.' This mentions Python and PyTorch versions plus GPU type, but there is no requirements.txt, Dockerfile, or detailed dependency list sufficient to recreate the full environment." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions are provided. There is no README, no scripts to replicate experiments, and no repository. The paper describes the algorithm (Algorithm 1) but does not provide instructions for replicating specific experimental results." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "The results in Figure 2 (average rewards), Figure 3 (diversity and coherence), and Table 1 (GPT-4 win-tie rates) are all presented as point estimates without confidence intervals or error bars." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper claims Collab 'consistently outperforms' baselines and achieves 'up to 1.56x improvement' but no statistical significance tests (p-values, t-tests, etc.) are reported to support these comparative claims." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "The abstract states 'achieving an improvement of up to 1.56x in average reward and 71.89% in GPT-4 based win-tie rate.' Table 1 provides specific win-tie percentages for each evaluation setup, and Figure 2 shows normalized average rewards with baseline context." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "The paper states '300 prompts' were sampled for GPT-4 evaluation but does not justify this sample size. The number of test prompts for the average reward evaluations is not specified, and no power analysis is provided." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "No variance, standard deviation, or spread measures are reported for any results. The figures show single-run bar charts without error bars. There is no mention of averaging across multiple runs or seeds." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "The paper compares against individual agent decoding (Agent-I, Agent-II using SoTA decoding from Chakraborty et al., 2024b) and Best-of-N (BoN) sampling (Nakano et al., 2021), as described in Section 5 and shown in Figures 2-3 and Table 1." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "The baselines use state-of-the-art controlled decoding methods (TQ* from Chakraborty et al., 2024b; ARGS from Khanov et al., 2024), which are contemporaneous with this work. The individual agents use SoTA decoding approaches." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "Figure 4a shows an ablation comparing 'Switch without Diversity' (two similar models) vs. 'Switch with Diversity' (two diverse agents), demonstrating the importance of agent diversity. Figure 4b shows the effect of increasing the number of diverse agents." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "The paper reports four metrics: average reward (Figure 2), GPT-4 win-tie rate (Table 1), diversity (Figure 3), and coherence (Figure 3)." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": false, 94 "justification": "No human evaluation is conducted. GPT-4 is used as a 'surrogate for human assessment' (Section 5), but this is an automated evaluation, not actual human judgment. The paper evaluates alignment quality, where human evaluation of outputs would be relevant." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "The paper refers to a 'test set' and 'test prompts' (Section 5), and the evaluation is conducted on prompts from established benchmark datasets (Berkeley Nectar, HH-RLHF) which have standard train/test splits." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Results are broken down across 7 evaluation setups (Evaluation 1-7) spanning two tasks (Task-I and Task-II), as shown in Table 1 and Figures 2-3, with different model combinations per setup as detailed in Table 3." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": false, 109 "justification": "The paper does not discuss failure cases or where the approach breaks down. Examples in Appendix G show outputs from different strategies but do not highlight failures of the proposed method. Example-3 (the chicken math problem) shows Collab getting the correct answer (54), but there is no systematic error analysis." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": false, 114 "justification": "Every experiment shows improvement for Collab over baselines. No configurations or settings that failed or hurt performance are reported." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The abstract claims '1.56x in average reward' and '71.89% in GPT-4 based win-tie rate.' Table 1 shows win-tie rates, with Evaluation-2 for Collab vs Agent-I at 71.89%. Figure 2 shows normalized reward improvements consistent with the 1.56x claim. The theoretical claims about optimality bounds are proven in Theorem 1." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "The paper makes causal claims through ablation studies (Figure 4a: diversity improves performance; Figure 4b: more agents improve performance) and controlled comparisons (same prompts, same datasets, varying only the decoding strategy). The ablation design involves controlled single-variable manipulation." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The paper's title and claims are framed broadly as 'LLM Alignment' but experiments are conducted only with 7B-parameter open-source models on two datasets (Berkeley Nectar and HH-RLHF). The paper does not explicitly state boundaries on generalization to larger models, other task types, or different model families." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper does not discuss alternative explanations for the observed improvements. For example, it does not address whether improvements could be attributed to effectively having more compute at inference time (evaluating multiple models), or whether the reward model used for evaluation biases toward certain response styles." 137 } 138 }, 139 "setup_transparency": { 140 "model_versions_specified": { 141 "applies": true, 142 "answer": false, 143 "justification": "Table 3 lists model names like 'Zephyr-7B-α', 'Starling-7B-α', 'Mistral-7B-α-IT', 'Dolphin-2.6-Mistral-7B-DPO', etc. While these are specific model names, no Hugging Face model IDs, snapshot dates, or version hashes are provided. GPT-4 is used for evaluation but no version (e.g., gpt-4-0613) is specified." 144 }, 145 "prompts_provided": { 146 "applies": true, 147 "answer": false, 148 "justification": "The GPT-4 evaluation prompt is described in natural language ('We prompt GPT-4 to rate responses from various decoding strategies on relevance, accuracy, and insightfulness, scoring them from 1 to 10') but the actual prompt text is not provided." 149 }, 150 "hyperparameters_reported": { 151 "applies": true, 152 "answer": true, 153 "justification": "Section 5 states 'we set the number of tokens sampled (top-p) p = 10 and the decoding alignment parameter α = 1.' Maximum prompt length (128) and continuation length (2048) are specified. Greedy-based sampling is stated for baselines." 154 }, 155 "scaffolding_described": { 156 "applies": false, 157 "answer": false, 158 "justification": "The paper does not use agentic scaffolding in the sense of tools, retry logic, or feedback mechanisms. The term 'agents' here refers to different LLM policies combined at the token decoding level, not an agentic system with tools." 159 }, 160 "data_preprocessing_documented": { 161 "applies": true, 162 "answer": false, 163 "justification": "The paper does not describe how prompts were selected from Berkeley Nectar or HH-RLHF datasets, how many prompts were used for the main evaluations (only 300 mentioned for GPT-4 evaluation), or any filtering/preprocessing steps applied to the datasets." 164 } 165 }, 166 "limitations_and_scope": { 167 "limitations_section_present": { 168 "applies": true, 169 "answer": false, 170 "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The paper has an Introduction, Related Work, Problem Formulation, Theoretical Analysis, Experimental Evaluations, Acknowledgments, Disclaimer, and Appendices, but no limitations discussion." 171 }, 172 "threats_to_validity_specific": { 173 "applies": true, 174 "answer": false, 175 "justification": "No threats to validity are discussed anywhere in the paper. There is no discussion of potential issues with the evaluation methodology, model selection, or generalizability of results." 176 }, 177 "scope_boundaries_stated": { 178 "applies": true, 179 "answer": false, 180 "justification": "The paper does not explicitly state what the results do not show. There is no discussion of limitations regarding model size (only 7B tested), dataset coverage, or situations where the approach might not work." 181 } 182 }, 183 "data_integrity": { 184 "raw_data_available": { 185 "applies": true, 186 "answer": false, 187 "justification": "No raw experimental data (generated responses, reward scores, per-prompt results) is made available. Only aggregated metrics are shown in figures and tables." 188 }, 189 "data_collection_described": { 190 "applies": true, 191 "answer": true, 192 "justification": "Section 5 describes the data sources: Berkeley Nectar dataset for Task-I (multi-turn dialogues and question answering) and HH-RLHF dataset for Task-II (helpfulness and ethical alignment). Table 3 details the specific model combinations for each evaluation setup." 193 }, 194 "recruitment_methods_described": { 195 "applies": false, 196 "answer": false, 197 "justification": "No human participants are involved. The study uses standard public benchmark datasets." 198 }, 199 "data_pipeline_documented": { 200 "applies": true, 201 "answer": false, 202 "justification": "The pipeline from raw dataset prompts to final evaluation results is not fully documented. The paper does not describe how test prompts were selected, how the 300 prompts for GPT-4 evaluation were sampled, or intermediate processing steps." 203 } 204 }, 205 "conflicts_of_interest": { 206 "funding_disclosed": { 207 "applies": true, 208 "answer": true, 209 "justification": "The Acknowledgments section lists specific funding sources: DARPA TIAMAT 80321, NSF-IIS-2147276, DOD-ONR N00014-22-1-2335, DOD-AFOSR FA9550-23-1-0048, DOD-DARPA GARD HR00112020007, Adobe, Capital One, and JP Morgan faculty fellowships." 210 }, 211 "affiliations_disclosed": { 212 "applies": true, 213 "answer": true, 214 "justification": "Author affiliations are clearly listed: JPMorgan AI Research, University of Maryland College Park, and Princeton University. The paper header identifies the first author's work as 'done as a part of an internship at JPMorgan AI Research.'" 215 }, 216 "funder_independent_of_outcome": { 217 "applies": true, 218 "answer": true, 219 "justification": "The funders (DARPA, NSF, DOD, Adobe, Capital One, JP Morgan) do not have a direct financial stake in the specific outcome of this alignment method comparison. JPMorgan AI Research is a co-affiliation but the paper evaluates open-source models and publicly available datasets, not JPMorgan products." 220 }, 221 "financial_interests_declared": { 222 "applies": true, 223 "answer": false, 224 "justification": "No competing interests or financial interests statement is present in the paper. There is a JPMorgan disclaimer about informational purposes but no explicit declaration of personal financial interests (patents, equity, etc.)." 225 } 226 }, 227 "contamination": { 228 "training_cutoff_stated": { 229 "applies": true, 230 "answer": false, 231 "justification": "The paper uses pre-trained models (Zephyr-7B, Starling-7B, Mistral-7B, Dolphin models) but does not state their training data cutoff dates. The evaluation benchmarks (Berkeley Nectar, HH-RLHF) could potentially overlap with model training data." 232 }, 233 "train_test_overlap_discussed": { 234 "applies": true, 235 "answer": false, 236 "justification": "No discussion of whether the evaluation prompts from Berkeley Nectar or HH-RLHF may have been seen during the training of the models used (Zephyr, Starling, Dolphin, Mistral). HH-RLHF in particular is widely used and likely in many models' training data." 237 }, 238 "benchmark_contamination_addressed": { 239 "applies": true, 240 "answer": false, 241 "justification": "HH-RLHF (Bai et al., 2022) and Berkeley Nectar (Zhu et al., 2023) are public datasets published before many of the models used were trained. The paper does not address whether these benchmarks were in the models' training data." 242 } 243 }, 244 "human_studies": { 245 "pre_registered": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants are involved in this study." 249 }, 250 "irb_or_ethics_approval": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants are involved in this study." 254 }, 255 "demographics_reported": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants are involved in this study." 259 }, 260 "inclusion_exclusion_criteria": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants are involved in this study." 264 }, 265 "randomization_described": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants are involved in this study." 269 }, 270 "blinding_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants are involved in this study." 274 }, 275 "attrition_reported": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants are involved in this study." 279 } 280 }, 281 "cost_and_practicality": { 282 "inference_cost_reported": { 283 "applies": true, 284 "answer": false, 285 "justification": "The method requires running multiple LLMs and evaluating implicit Q-functions at each token, which is substantially more expensive than single-agent decoding. No inference cost, latency, or tokens-per-second measurements are reported." 286 }, 287 "compute_budget_stated": { 288 "applies": true, 289 "answer": false, 290 "justification": "Appendix A mentions 'two Nvidia RTX A6000 GPUs' but does not state total GPU hours, wall-clock time for experiments, or any cost estimate for running the multi-agent decoding approach." 291 } 292 } 293 }, 294 "claims": [ 295 { 296 "claim": "Collab achieves up to 1.56x improvement in average reward over single-agent decoding baselines.", 297 "evidence": "Figure 2 shows normalized average reward values across 6 evaluation setups, with Collab consistently outperforming Agent-I, Agent-II, and BoN sampling baselines.", 298 "supported": "moderate" 299 }, 300 { 301 "claim": "Collab achieves 71.89% GPT-4 based win-tie rate against the best individual agent.", 302 "evidence": "Table 1 shows GPT-4 evaluation results. Collab vs Agent-I on Task-I Evaluation-2 achieves 71.89% win-tie rate. Results range from 50.00% to 73.75% across setups.", 303 "supported": "moderate" 304 }, 305 { 306 "claim": "The sub-optimality of the multi-agent decoding policy is bounded by the reward difference between the target and best model plus KL divergence terms.", 307 "evidence": "Theorem 1 (Section 4) with full proof in Appendix D.1 establishes the sub-optimality bound. The proof uses Lemma 1 and properties of KL-regularized RL.", 308 "supported": "strong" 309 }, 310 { 311 "claim": "Diversity among agents enhances collaborative performance in the mixture.", 312 "evidence": "Figure 4a compares 'Switch without Diversity' vs 'Switch with Diversity', showing improved average reward with diverse agents. Figure 4b shows increasing reward with more diverse agents.", 313 "supported": "moderate" 314 }, 315 { 316 "claim": "Collab outperforms baselines in diversity and coherence metrics.", 317 "evidence": "Figure 3 shows diversity and coherence values for Evaluations 1-3, with Collab outperforming other baselines in both metrics.", 318 "supported": "moderate" 319 } 320 ], 321 "methodology_tags": [ 322 "benchmark-eval", 323 "theoretical" 324 ], 325 "key_findings": "Collab proposes a mixture-of-agents controlled decoding strategy that dynamically selects the most suitable LLM at each token position using an implicit Q-function as the guiding metric. The approach provides theoretical sub-optimality bounds showing performance improves when the best agent's reward function is close to the target. Empirical evaluations on 7 setups using 7B open-source models show consistent improvements over single-agent decoding baselines, with up to 1.56x average reward improvement and 71.89% GPT-4 win-tie rate. Agent diversity is shown to be important for collaborative performance gains.", 326 "red_flags": [ 327 { 328 "flag": "No uncertainty quantification", 329 "detail": "All results are reported as point estimates without confidence intervals, error bars, or variance across runs. For a method involving stochastic decoding from multiple models, the lack of any variance reporting is a significant gap — it is unclear whether the improvements are within noise." 330 }, 331 { 332 "flag": "No statistical significance tests", 333 "detail": "Comparative claims ('outperforms', 'consistently outperforms') are made solely by comparing raw numbers without any statistical tests. Some win-tie rates are as low as 50.00% (Evaluation 5 vs BoN), suggesting Collab does not always meaningfully outperform baselines." 334 }, 335 { 336 "flag": "No limitations section", 337 "detail": "A published ICLR paper with no limitations, threats to validity, or scope boundaries discussion. The method's practical computational overhead (running multiple models per token) is never acknowledged or quantified." 338 }, 339 { 340 "flag": "Inference cost not reported", 341 "detail": "The method requires sampling top-p tokens from each of K agents and evaluating the implicit Q-function for each, making it substantially more expensive than single-agent decoding. This cost is never quantified, making practical applicability difficult to assess." 342 }, 343 { 344 "flag": "GPT-4 as sole quality evaluator", 345 "detail": "GPT-4 is used as a 'surrogate for human assessment' but no actual human evaluation is conducted. GPT-4 evaluation has known biases (position bias, verbosity preference, self-preference) which are not discussed." 346 }, 347 { 348 "flag": "Benchmark contamination unaddressed", 349 "detail": "HH-RLHF (2022) and Berkeley Nectar (2023) are public datasets that likely appeared in the training data of the models evaluated (Zephyr, Starling, Dolphin, Mistral). This potential contamination is never discussed." 350 }, 351 { 352 "flag": "Limited model scale", 353 "detail": "All experiments use 7B-parameter models only, but conclusions are framed broadly for 'LLM Alignment' without acknowledging this restriction." 354 } 355 ], 356 "cited_papers": [ 357 { 358 "title": "Controlled decoding from language models", 359 "authors": ["Sidharth Mudgal", "Jong Lee", "Harish Ganapathy"], 360 "year": 2024, 361 "relevance": "Foundational work on inference-time alignment via controlled decoding that this paper builds upon and extends to multi-agent settings." 362 }, 363 { 364 "title": "Transfer Q Star: Principled Decoding for LLM Alignment", 365 "authors": ["Souradip Chakraborty", "Soumya Suvra Ghosal", "Ming Yin"], 366 "year": 2024, 367 "arxiv_id": "2405.20495", 368 "relevance": "Prior single-agent principled decoding method that serves as a key baseline and theoretical predecessor to Collab." 369 }, 370 { 371 "title": "Direct preference optimization: Your language model is secretly a reward model", 372 "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"], 373 "year": 2023, 374 "relevance": "Foundational alignment method (DPO) that Collab aims to complement or replace through inference-time decoding rather than fine-tuning." 375 }, 376 { 377 "title": "Training language models to follow instructions with human feedback", 378 "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"], 379 "year": 2022, 380 "relevance": "Seminal RLHF paper establishing the training-based alignment paradigm that decoding methods like Collab aim to avoid." 381 }, 382 { 383 "title": "Learning to decode collaboratively with multiple language models", 384 "authors": ["Zejiang Shen", "Hunter Lang", "Bailin Wang"], 385 "year": 2024, 386 "doi": "10.18653/v1/2024.acl-long.701", 387 "relevance": "Concurrent work on collaborative multi-model decoding, directly relevant to multi-agent LLM collaboration approaches." 388 }, 389 { 390 "title": "ARGS: Alignment as Reward-Guided Search", 391 "authors": ["Maxim Khanov", "Jirayu Burapacheep", "Yixuan Li"], 392 "year": 2024, 393 "relevance": "State-of-the-art decoding alignment baseline used in the paper's experimental comparisons." 394 }, 395 { 396 "title": "DEAL: Decoding-time alignment for large language models", 397 "authors": ["James Y Huang", "Sailik Sengupta", "Daniele Bonadiman"], 398 "year": 2024, 399 "relevance": "Alternative approach to inference-time LLM alignment that reframes text generation as a search problem." 400 }, 401 { 402 "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback", 403 "authors": ["Yuntao Bai", "Andy Jones", "Kamal Ndousse"], 404 "year": 2022, 405 "arxiv_id": "2204.05862", 406 "relevance": "Source of the HH-RLHF dataset used for evaluation and foundational work on safety alignment." 407 }, 408 { 409 "title": "Collaborative decoding of critical tokens for boosting factuality of large language models", 410 "authors": ["Lifeng Jin", "Baolin Peng", "Linfeng Song"], 411 "year": 2024, 412 "arxiv_id": "2402.17982", 413 "relevance": "Related work on collaborative multi-model decoding focused on factuality, relevant to multi-agent LLM coordination." 414 }, 415 { 416 "title": "Tuning language models by proxy", 417 "authors": ["Alisa Liu", "Xiaochuang Han", "Yizhong Wang"], 418 "year": 2024, 419 "arxiv_id": "2401.08565", 420 "relevance": "Alternative approach to leveraging multiple models for alignment without direct fine-tuning." 421 }, 422 { 423 "title": "Scaling laws for reward model overoptimization", 424 "authors": ["Leo Gao", "John Schulman", "Jacob Hilton"], 425 "year": 2023, 426 "arxiv_id": "2210.10760", 427 "relevance": "Documents reward overoptimization in RLHF, a key motivation for inference-time alignment approaches like Collab." 428 } 429 ] 430 }