scan.json (27739B)
1 { 2 "paper": { 3 "title": "AdaFuse: Adaptive Ensemble Decoding with Test-Time Scaling for LLMs", 4 "authors": [ 5 "Chengming Cui", 6 "Tianxin Wei", 7 "Ziyi Chen", 8 "Ruizhong Qiu", 9 "Zhichen Zeng", 10 "Zhining Liu", 11 "Xuying Ning", 12 "Duo Zhou", 13 "Jingrui He" 14 ], 15 "year": 2026, 16 "venue": "arXiv preprint", 17 "arxiv_id": "2601.06022" 18 }, 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The abstract states 'The code is available at https://github.com/CCM0111/AdaFuse.' A working GitHub URL is provided." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "All benchmarks used (NaturalQuestions, SQuAD, TriviaQA, GSM8K, FLORES) are publicly available standard datasets. Appendix B states 'we use standard QA and translation datasets...without modifying their contents.'" 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": true, 34 "justification": "Appendix D.4 specifies the full software environment: PyTorch v2.4.1 (CUDA 12.1), HuggingFace Transformers v4.51.3, Tokenizers v0.21.0, Accelerate v1.6.0, and Datasets v3.0.2. This is sufficient to recreate the environment." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "While code is released and hyperparameters are documented in the paper, the paper does not include step-by-step reproduction instructions (e.g., commands to run experiments) in the main text or appendix. Appendix B.5 mentions 'clear module structure' with inline comments but this is a description of the repo, not reproduction steps." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "No confidence intervals or error bars are reported for any results in Table 1 or other results tables. 
Appendix D.3 explicitly states 'we report results from a deterministic decoding setting,' implying single-run point estimates only." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "No statistical significance tests are performed. The paper claims ADAFUSE 'consistently outperforms' baselines based solely on point estimate comparisons (e.g., 63.23 vs 59.16 average), with no p-values or hypothesis tests." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "The paper reports relative percentage improvements alongside raw scores (e.g., '+10.01%' on NQ, '+4.12%' on SQuAD) with baseline context provided, which is sufficient for effect size interpretation." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "The number of test examples per benchmark (e.g., 3610 NQ, 2500 SQuAD) is listed in Appendix C, but no justification is given for why these specific subset sizes were chosen or whether the samples are representative of the full benchmark." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "Appendix D.3 states results are from a 'deterministic decoding setting,' indicating single-run results. No standard deviation, variance, or IQR is reported across multiple runs." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "Four representative ensemble baselines are included: LLM-BLENDER, DEEPEN, SWEETSPAN, and UNITE, covering sample-level, span-level, and token-level ensemble approaches." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "The baselines are all recent: LLM-BLENDER (2023), DEEPEN (2024), SWEETSPAN (2024), and UNITE (2024). These represent the current state of the art in LLM ensemble methods." 
79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "Section 4.3 presents an ablation study (RQ2) replacing adaptive word commitment with fixed-length word commitment of lengths 1, 2, or 3. Section 4.4 ablates the diversity-aware ensemble scaling component. Appendix F compares diversity-aware scaling vs. beam-search scaling." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "The paper uses exact match accuracy for QA tasks, answer accuracy for GSM8K, and spBLEU for machine translation. It also mentions BERTScore as an additional semantic similarity metric (Appendix D.4)." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": false, 93 "justification": "The paper includes machine translation evaluation (FLORES En-De, De-En) where BLEU scores are known to correlate imperfectly with human translation quality judgments. Human evaluation is relevant for assessing translation output quality and is commonly used in MT research. The paper did not include any human evaluation. For the QA and math tasks, automated metrics (exact match, accuracy) are sufficient, but the inclusion of translation tasks makes human evaluation applicable." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": true, 98 "justification": "The paper evaluates on standard held-out test splits of established benchmarks (NQ, SQuAD, TriviaQA, GSM8K, FLORES). Hyperparameter sensitivity analysis (confidence threshold τ∆) is conducted on NQ separately, and the chosen value is then applied uniformly to all datasets." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Table 1 reports per-benchmark scores across six benchmarks covering three task categories (knowledge-intensive QA, arithmetic reasoning, machine translation). The paper also analyzes performance by fusion granularity in Section 4.2." 
104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 4.2 acknowledges that on GSM8K with the fixed base pair (LLaMA + Mistral), ADAFUSE scores 79.15, below LLaMA alone (81.05), attributing this to 'large performance disparity between the underlying base models.' Table 2 shows adding Qwen3-8B as a 4th model causes a performance drop. The Limitations section acknowledges applicability is restricted to open-source models." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper reports that adding a 4th model (Qwen3-8B) to the ensemble leads to a performance drop on NQ (Table 2). The Limitations section notes ADAFUSE cannot be applied to closed-source APIs. Appendix F shows beam-search scaling degrades performance." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "The abstract claims 'an average relative improvement of 6.88%' over strong ensemble baselines, which is precisely reported in the Table 1 improvement row (the relative gain in the six-benchmark average score, 63.23 vs. 59.16; the mean of the six per-task relative improvements would be about 7.30%). The specific tasks mentioned (open-domain QA, arithmetic reasoning, machine translation) are all tested." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "The ablation studies (Section 4.3, RQ2) isolate the adaptive word commitment component by replacing it with fixed-length alternatives while keeping other components constant, constituting controlled single-variable manipulation. This supports the causal claim that adaptive commitment improves performance." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper's title 'AdaFuse: Adaptive Ensemble Decoding with Test-Time Scaling for LLMs' suggests general applicability to LLMs. 
The abstract claims ADAFUSE 'consistently outperforms strong ensemble baselines' without bounding this to the tested setting. Results cover only 4 open-source models of ~7-8B parameters on 6 benchmarks across 3 task types. While the Limitations section notes the method requires open-source models with token-level likelihoods, it does not explicitly bound generalization claims to the tested model sizes (~8B), tested task types (QA, math, translation), or the number of languages tested. The conclusion speaks broadly of 'diverse tasks and evolving generation contexts.' Per the schema, a broad title with narrow evaluation warrants answer=false." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper does not substantively discuss alternative explanations for its results. For instance, it attributes GSM8K underperformance to 'large performance disparity between base models' but does not consider other explanations (e.g., task-specific formatting issues, different tokenization). The Limitations section addresses applicability constraints but not alternative explanations for observed results." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Specific model versions are stated: LLaMA-3.1-8B-Instruct, Mistral-7B-Instruct-v0.3, Qwen3-8B, and InternLM3-8B-Instruct. These are specific enough to identify the exact model checkpoints used." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper describes that few-shot prompting is used (e.g., '5-shot' for NQ, SQuAD, TriviaQA; '4-shot with chain-of-thought prompting' for GSM8K; '0-shot' for FLORES) but does not provide the actual prompt text or the few-shot examples used. No prompts are shown in the main text or appendix." 
148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Key hyperparameters are reported: confidence threshold τ∆=0.7, maximum committed words M=3, branching factor B, and context length limit of up to 512 tokens (Section 4.1 and Appendix D.2). A sensitivity analysis for τ∆ is provided in Figure 6." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "ADAFUSE is a decoding algorithm, not an agentic scaffolding system. No agentic scaffolding (tools, retry logic, memory management) is used." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": false, 162 "justification": "The paper reports using standard benchmark splits but does not describe any preprocessing steps applied to the benchmarks. Appendix C provides descriptive statistics but does not document filtering criteria or preprocessing pipeline." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "There is a dedicated 'Limitations' section after the Conclusion that discusses the key limitation of requiring access to token-level likelihoods, restricting ADAFUSE to open-source models." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "The Limitations section contains only one specific limitation (black-box API incompatibility). It does not discuss threats specific to the evaluation such as benchmark contamination risk, the representativeness of the 6 chosen benchmarks, sensitivity to the chosen base model pair, or whether results generalize beyond the tested model sizes (~7-8B parameters)." 
175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "While the Limitations section notes the method requires open-source models with accessible token probabilities, the paper does not explicitly state what its results do NOT show — e.g., it does not clarify that results may not generalize to larger models, closed-source models, non-English tasks beyond German, or other task types not tested." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "All benchmarks used are publicly available standard datasets (NaturalQuestions, SQuAD, TriviaQA, GSM8K, FLORES). Independent researchers can verify results by downloading these datasets directly." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "The paper uses publicly available benchmarks and references their original papers. Appendix C provides descriptive statistics (number of entries, average question/answer lengths) for each dataset." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants are involved. The paper uses pre-existing benchmark datasets. Recruitment methods are not applicable." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "The paper does not document how benchmark subsets were selected (e.g., SQuAD 2500 entries — how were these 2500 selected from the full SQuAD dataset?). No filtering criteria for subset selection are described." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No acknowledgments section mentioning grants, funding sources, or corporate sponsors is present in the paper. There is no mention of any funding source." 
209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All authors are affiliated with the University of Illinois Urbana-Champaign, as stated on the title page. The paper does not evaluate any product made by the authors' institution." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": false, 217 "answer": false, 218 "justification": "No funding source is disclosed; applies=false because the question of funder independence cannot be assessed when no funder is identified." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "There is no competing interests statement in the paper. The absence of disclosure does not confirm absence of conflict." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper uses LLaMA-3.1-8B-Instruct, Mistral-7B-Instruct-v0.3, Qwen3-8B, and InternLM3-8B-Instruct to evaluate on standard benchmarks, but does not state the training data cutoff dates for any of these models." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "The paper does not discuss potential train/test overlap. Benchmarks like NaturalQuestions, SQuAD, and TriviaQA are well-established datasets that predate the training cutoff of the tested models, making contamination a real concern that is not addressed." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "NaturalQuestions (2019), SQuAD (2016), TriviaQA (2017), and GSM8K (2021) were all published before the likely training cutoffs of the models used. The paper makes no mention of contamination risk or any analysis thereof." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants are involved in this study. 
The paper evaluates an automated decoding algorithm on NLP benchmarks." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants are involved. IRB approval is not applicable." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants are involved." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants are involved." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants are involved." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants are involved." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants are involved." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Section 4.5 reports wall-clock inference time for ADAFUSE vs. baselines using 4x NVIDIA A100 80GB GPUs on NQ dataset (Figure 5). Appendix A discusses GPU energy consumption tradeoffs." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": true, 289 "justification": "Appendix D.1 states 'The total compute budget for all experiments is approximately 500 A100 GPU-hours.' Hardware specs (NVIDIA A100 80GB) and model sizes (~7-8B parameters, ~30GB GPU memory in 16-bit precision) are also specified." 290 } 291 } 292 }, 293 "claims": [ 294 { 295 "claim": "ADAFUSE achieves an average relative improvement of 6.88% over the strongest ensemble baseline (SWEETSPAN) across six benchmarks.", 296 "evidence": "Table 1 shows ADAFUSE (Fixed Base) scores 63.23 average vs. 
SWEETSPAN's 59.16, with per-task relative improvements of +10.01% (NQ), +4.12% (SQuAD), +0.97% (TriviaQA), +17.03% (GSM8K), +6.04% (Flores En-De), +5.60% (Flores De-En).", 297 "supported": "strong" 298 }, 299 { 300 "claim": "Adaptive word commitment outperforms fixed-length word commitment (lengths 1, 2, or 3) on Natural Questions.", 301 "evidence": "Figure 2 in Section 4.3 shows that fixed-length decoding at all word lengths consistently underperforms adaptive ADAFUSE on the NQ dataset.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "Diversity-aware ensemble scaling consistently improves performance across five benchmarks when enabled.", 306 "evidence": "Figure 4 (left) in Section 4.4 compares ADAFUSE with and without diversity-aware scaling, showing consistent improvement across benchmarks. Increasing branching factor B from 1 to 5 shows generally upward trend (Figure 4 right).", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "ADAFUSE achieves comparable runtime to UniTE and is faster than SweetSpan and DeepEn.", 311 "evidence": "Figure 5 in Section 4.5 reports wall-clock inference time on NQ under a standardized setting (batch size 1, 4x A100 80GB GPUs), showing ADAFUSE's runtime is comparable to UniTE and substantially faster than SweetSpan and DeepEn.", 312 "supported": "moderate" 313 }, 314 { 315 "claim": "Base model compatibility is a critical factor in ensemble effectiveness, as demonstrated by GSM8K results.", 316 "evidence": "Section 4.2 reports ADAFUSE (Fixed Base) scores 79.15 on GSM8K, below LLaMA-3.1-8B-Instruct alone (81.05), but ADAFUSE with Qwen3-8B + LLaMA (Top-2 oracle) achieves 90.25. 
The authors attribute the gap to large performance disparity between Mistral and LLaMA on GSM8K.", 317 "supported": "moderate" 318 } 319 ], 320 "methodology_tags": [ 321 "benchmark-eval" 322 ], 323 "key_findings": "ADAFUSE is an adaptive word-level ensemble decoding framework that combines multiple LLMs by dynamically deciding when to commit multi-word spans based on model confidence, and invoking diversity-aware candidate exploration under uncertainty. Evaluated on 6 benchmarks across 3 task types using 4 open-source models (~7-8B parameters), ADAFUSE achieves an average relative improvement of 6.88% over the strongest baseline ensemble method (SWEETSPAN). The method requires token-level likelihoods and is thus limited to open-source models, though it achieves competitive runtime compared to simpler ensemble methods by reducing total decoding rounds through confident span extension.", 324 "red_flags": [ 325 { 326 "flag": "No statistical uncertainty quantification", 327 "detail": "All results in Table 1 are from a single deterministic decoding run with no error bars, standard deviations, or confidence intervals. With average improvements of only a few percentage points (e.g., +0.97% on TriviaQA), it is impossible to assess whether these differences are meaningful or within noise." 328 }, 329 { 330 "flag": "Benchmark contamination not discussed", 331 "detail": "The paper evaluates pre-trained LLMs on NaturalQuestions (2019), SQuAD (2016), TriviaQA (2017), and GSM8K (2021), all of which predate the likely training cutoffs of the tested models. No contamination analysis or discussion is provided, which could inflate absolute performance numbers and obscure whether ensemble gains are real." 332 }, 333 { 334 "flag": "Prompts not disclosed", 335 "detail": "The paper uses 5-shot prompting for QA tasks and 4-shot chain-of-thought for GSM8K but provides no prompt text or few-shot examples. 
Prompt choice significantly affects LLM performance, making results non-reproducible without this information." 336 }, 337 { 338 "flag": "Oracle result misleadingly included", 339 "detail": "Table 1 includes 'ADAFUSE (Top-2 Base)' which uses an oracle selection of the two best-performing base models per task. This result (65.12 avg) is bolded as the 'overall best result' but requires knowing which models perform best in advance — an impractical assumption in real deployment." 340 }, 341 { 342 "flag": "Funding not disclosed", 343 "detail": "No acknowledgments or funding disclosure is present. This is a minor concern for an academic paper from a university lab, but the omission is notable." 344 } 345 ], 346 "cited_papers": [ 347 { 348 "title": "LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion", 349 "authors": [ 350 "Dongfu Jiang", 351 "Xiang Ren", 352 "Bill Yuchen Lin" 353 ], 354 "year": 2023, 355 "arxiv_id": "2306.02561", 356 "relevance": "A key baseline ensemble method that trains a pairwise ranker and generative fusion model to combine LLM outputs, directly compared against in the AdaFuse evaluation." 357 }, 358 { 359 "title": "Determine-Then-Ensemble: Necessity of Top-k Union for Large Language Model Ensembling", 360 "authors": [ 361 "Yuxuan Yao", 362 "Han Wu", 363 "Mingyang Liu", 364 "Sichun Luo", 365 "Xiongwei Han", 366 "Jie Liu", 367 "Zhijiang Guo", 368 "Linqi Song" 369 ], 370 "year": 2024, 371 "arxiv_id": "2410.03777", 372 "relevance": "The UniTE baseline method for token-level LLM ensemble decoding using top-k union filtering, directly compared against in AdaFuse experiments." 373 }, 374 { 375 "title": "Hit the Sweet Spot! 
Span-Level Ensemble for Large Language Models", 376 "authors": [ 377 "Yangyifan Xu", 378 "Jianghao Chen", 379 "Junhong Wu", 380 "Jiajun Zhang" 381 ], 382 "year": 2024, 383 "arxiv_id": "2409.18583", 384 "relevance": "The SweetSpan baseline for span-level LLM ensemble decoding via perplexity-based scoring, the strongest competitor to AdaFuse in the evaluation." 385 }, 386 { 387 "title": "Ensemble Learning for Heterogeneous Large Language Models with Deep Parallel Collaboration", 388 "authors": [ 389 "Yichong Huang", 390 "Xiaocheng Feng", 391 "Baohang Li", 392 "Yang Xiang", 393 "Hui Wang", 394 "Ting Liu", 395 "Bing Qin" 396 ], 397 "year": 2024, 398 "relevance": "The DeePEn baseline for token-level ensemble decoding using vocabulary alignment and relative representation modeling, compared against in AdaFuse experiments." 399 }, 400 { 401 "title": "Cool-Fusion: Fuse Large Language Models without Training", 402 "authors": [ 403 "Cong Liu", 404 "Xiaojun Quan", 405 "Yan Pan", 406 "Liang Lin", 407 "Weigang Wu", 408 "Xu Chen" 409 ], 410 "year": 2024, 411 "arxiv_id": "2407.19807", 412 "relevance": "A span-level ensemble method that merges model outputs at common word boundaries, related to AdaFuse's adaptive word-level ensembling approach." 413 }, 414 { 415 "title": "The LLaMA 3 Herd of Models", 416 "authors": [ 417 "Aaron Grattafiori", 418 "Abhimanyu Dubey" 419 ], 420 "year": 2024, 421 "arxiv_id": "2407.21783", 422 "relevance": "Technical report for LLaMA-3.1-8B-Instruct, one of the primary base models used in AdaFuse experiments." 423 }, 424 { 425 "title": "Training Verifiers to Solve Math Word Problems", 426 "authors": [ 427 "Karl Cobbe", 428 "Vineet Kosaraju", 429 "Mohammad Bavarian" 430 ], 431 "year": 2021, 432 "arxiv_id": "2110.14168", 433 "relevance": "GSM8K benchmark paper for arithmetic reasoning evaluation, one of the six benchmarks used to evaluate AdaFuse." 
434 }, 435 { 436 "title": "Natural Questions: A Benchmark for Question Answering Research", 437 "authors": [ 438 "Tom Kwiatkowski", 439 "Jennimaria Palomaki", 440 "Olivia Redfield", 441 "Michael Collins" 442 ], 443 "year": 2019, 444 "relevance": "NaturalQuestions benchmark used as the primary open-domain QA evaluation dataset in AdaFuse experiments." 445 }, 446 { 447 "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models", 448 "authors": [ 449 "Xuezhi Wang", 450 "Jason Wei", 451 "Dale Schuurmans", 452 "Quoc Le", 453 "Ed Chi", 454 "Sharan Narang", 455 "Aakanksha Chowdhery", 456 "Denny Zhou" 457 ], 458 "year": 2023, 459 "arxiv_id": "2203.11171", 460 "relevance": "Foundational work on test-time scaling through self-consistency, directly related to AdaFuse's diversity-aware ensemble scaling component." 461 }, 462 { 463 "title": "FrugalGPT: How to Use Large Language Models while Reducing Cost and Improving Performance", 464 "authors": [ 465 "Lingjiao Chen", 466 "Matei Zaharia", 467 "James Zou" 468 ], 469 "year": 2023, 470 "arxiv_id": "2305.05176", 471 "relevance": "Work on LLM routing and cost-performance tradeoffs, relevant to the ensemble and routing approaches contextualized in AdaFuse." 472 }, 473 { 474 "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models", 475 "authors": [ 476 "Shunyu Yao", 477 "Dian Yu", 478 "Jeffrey Zhao", 479 "Izhak Shafran", 480 "Tom Griffiths", 481 "Yuan Cao", 482 "Karthik Narasimhan" 483 ], 484 "year": 2023, 485 "relevance": "Influential test-time compute scaling method that AdaFuse's diversity-aware scaling strategy is related to and contrasted with." 
486 }, 487 { 488 "title": "Harnessing Consistency for Robust Test-Time LLM Ensemble", 489 "authors": [ 490 "Zhichen Zeng", 491 "Qi Yu", 492 "Xiao Lin", 493 "Ruizhong Qiu", 494 "Xuying Ning", 495 "Tianxin Wei", 496 "Yuchen Yan", 497 "Jingrui He", 498 "Hanghang Tong" 499 ], 500 "year": 2025, 501 "arxiv_id": "2510.13855", 502 "relevance": "Related work on consistency-based LLM ensemble methods at test time, directly related to AdaFuse's adaptive ensembling approach." 503 } 504 ] 505 }