scan.json (24950B)
1 { 2 "paper": { 3 "title": "COLT: Lightweight Multi-LLM Collaboration through Shared MCTS Reasoning for Model Compilation", 4 "authors": [ 5 "Annabelle Sujun Tang", 6 "Christopher Priebe", 7 "Lianhui Qin", 8 "Hadi Esmaeilzadeh" 9 ], 10 "year": 2026, 11 "venue": "Preprint", 12 "arxiv_id": "2602.01935" 13 }, 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper states 'Code is available at https://github.com/he-actlab/COLT' on page 1 (footnote)." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The benchmarks are drawn from publicly available models (Llama-3-8B, DeepSeek-R1, FLUX, Llama-4-Scout) and the framework is built on Apache TVM, a public framework. The computational kernels are standard layers extracted from public model architectures." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper mentions Apache TVM v0.20.0 and an Intel Core i9 CPU and NVIDIA 2080 Ti GPU as hardware, but does not provide a requirements.txt, Dockerfile, or detailed dependency/version listing sufficient to recreate the environment." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README with commands to run or a 'Reproducing Results' section is described in the paper itself." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper reports mean performance over 10 runs (Section 4.1: 'We repeat each experiment 10 times and report the mean performance') but does not report confidence intervals or error bars on any figures or tables." 
42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims COLT outperforms baselines (e.g., '20.8% improvement') but no statistical significance tests are reported. Comparisons are made by directly comparing mean speedup numbers without p-values or any statistical test." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports percentage improvements with baseline context. For example, 'COLT(8 Models) attains 14.98x final speedup, while GPT-5.2 only reaches 12.40x final speedup, a 20.8% improvement' (Section 4.2). Sample efficiency gains are also reported as ratios (e.g., 1.55x)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper uses 5 benchmarks and 10 repetitions per experiment but provides no justification for why these numbers were chosen or any power analysis. No discussion of whether 5 benchmarks or 10 runs is sufficient for the claims made." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Despite repeating each experiment 10 times, the paper reports only mean performance. No standard deviation, error bars, or any spread measure is reported in any table or figure." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Two single-model baselines are included: GPT-5.2 (single large) and gpt-5-mini (single small), using the same MCTS procedure with model selection disabled (Section 4.1)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "The baselines use GPT-5.2 (2025) and gpt-5-mini, which are contemporary models. The paper also compares against Llama-3.3-70B-Instruct as an alternative largest model. Prior LLM-guided compiler works (Pan et al., 2025; Tang et al., 2025) are cited as related." 
74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper includes ablation studies: Section 4.3 ablates the choice of largest model (GPT-5.2 vs Llama-3.3-70B-Instruct), Section 4.4 ablates the hardware platform (CPU vs GPU), and the scaling from 2 to 4 to 8 models serves as an implicit ablation of model set size." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "The paper reports multiple metrics: execution latency/speedup, sample efficiency, sample efficiency gain, model invocation rate, and course alteration rate (Section 4.1)." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "Human evaluation is not relevant for compiler optimization. The quality metric is execution latency, which is measured directly on hardware." 89 }, 90 "held_out_test_set": { 91 "applies": false, 92 "answer": false, 93 "justification": "This is not a learning task with train/test splits. The paper evaluates compiler optimization on specific computational kernels; there is no held-out test set concept applicable here." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down per benchmark (5 individual layers) across all figures and tables. Table 1 shows per-benchmark invocation rates. Tables 3-5 in the appendix provide per-layer, per-model call counts." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": false, 103 "justification": "The paper does not discuss specific failure cases or where the approach breaks down. No qualitative analysis of failed optimization trajectories or examples where COLT underperformed is provided." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": false, 108 "justification": "Every COLT configuration outperforms every single-model baseline across all benchmarks. 
No negative results, failed configurations, or approaches that were tried and abandoned are reported." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims COLT achieves 10.86x speedup on CPU (30.05x on GPU) and outperforms the single-model baseline with only 23.9% of calls to the largest LLM. These are supported by Figure 2, Figure 4, and Table 1 in the results." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper makes causal claims about collaboration improving performance (e.g., 'collaboration enhances the quality of proposed transformations'). The ablation studies (varying model set size, largest model, hardware) provide controlled single-variable manipulations that support these claims, as all other search parameters are held constant." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper tests on 5 computational kernels from 4 model architectures on 2 hardware platforms, but the title and abstract claim broad applicability for 'model compilation' without bounding generalization. No discussion of whether results extend to other compiler frameworks, non-neural workloads, or other hardware targets beyond the two tested." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not discuss alternative explanations for why collaboration helps. For example, it does not consider whether the improvement comes simply from increased diversity of proposals (more random seeds) rather than from the specific MCTS-based collaboration mechanism, or whether increased total compute budget explains the gains." 
131 } 132 }, 133 "setup_transparency": { 134 "model_versions_specified": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper uses 'GPT-5.2' and 'gpt-5-mini' without specifying API versions or snapshot dates. Llama-3.3-70B-Instruct and other open-source models are named but without specific checkpoint identifiers or version hashes. Marketing names alone do not count as specified versions." 138 }, 139 "prompts_provided": { 140 "applies": true, 141 "answer": true, 142 "justification": "Appendix B provides the full system prompt and a detailed example of the user prompt including code, transformation history, predicted scores, search context, per-model stats, local model context, and the expected JSON output format." 143 }, 144 "hyperparameters_reported": { 145 "applies": true, 146 "answer": true, 147 "justification": "Section 4.1 reports key hyperparameters: lambda=0.5, exploration parameter c=sqrt(2), branching factor B=2. The LLM API parameters (temperature, top-p, etc.) are not reported; however, the MCTS-specific hyperparameters are clearly stated." 148 }, 149 "scaffolding_described": { 150 "applies": true, 151 "answer": true, 152 "justification": "The MCTS-based scaffolding is described in detail throughout Sections 2-3, including tree policy (Section 3.2), contextual model selection (Section 3.3), and course alteration mechanism (Section 3.4). Figure 1 provides an overview diagram." 153 }, 154 "data_preprocessing_documented": { 155 "applies": true, 156 "answer": false, 157 "justification": "The paper does not describe how computational kernels are extracted from the production models, how TVM IRModules are constructed, or what preprocessing steps are applied before optimization begins." 158 } 159 }, 160 "limitations_and_scope": { 161 "limitations_section_present": { 162 "applies": true, 163 "answer": false, 164 "justification": "There is no limitations section. 
The paper has an 'Impact Statement' section that is a single paragraph about making compilation more efficient, but no substantive discussion of limitations or threats to validity." 165 }, 166 "threats_to_validity_specific": { 167 "applies": true, 168 "answer": false, 169 "justification": "No threats to validity are discussed anywhere in the paper." 170 }, 171 "scope_boundaries_stated": { 172 "applies": true, 173 "answer": false, 174 "justification": "The paper does not explicitly state what the results do NOT show. No discussion of which hardware targets, compiler frameworks, or workload types are excluded from the claims." 175 } 176 }, 177 "data_integrity": { 178 "raw_data_available": { 179 "applies": true, 180 "answer": false, 181 "justification": "Raw experimental data (per-run speedup values, optimization trajectories, timing measurements) are not available. Only aggregated means are reported in the paper." 182 }, 183 "data_collection_described": { 184 "applies": true, 185 "answer": true, 186 "justification": "Section 4.1 describes the experimental setup: benchmarks are specific layers from named models, target hardware is specified (Intel Core i9, NVIDIA 2080 Ti), performance is measured as execution latency on target hardware, and each experiment is repeated 10 times." 187 }, 188 "recruitment_methods_described": { 189 "applies": false, 190 "answer": false, 191 "justification": "No human participants. Data sources are standard computational kernels from public model architectures." 192 }, 193 "data_pipeline_documented": { 194 "applies": true, 195 "answer": false, 196 "justification": "The pipeline from model architecture to computational kernel to TVM IRModule to optimization is not fully documented. How the initial programs are extracted and what preprocessing they undergo before entering the MCTS loop is not described." 
197 } 198 }, 199 "conflicts_of_interest": { 200 "funding_disclosed": { 201 "applies": true, 202 "answer": false, 203 "justification": "No funding source is mentioned anywhere in the paper. There is no acknowledgments section listing grants or sponsors." 204 }, 205 "affiliations_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "All authors are listed as affiliated with University of California, San Diego (footnote 1). The paper uses OpenAI APIs (GPT-5.2, gpt-5-mini) but the authors are not affiliated with OpenAI." 209 }, 210 "funder_independent_of_outcome": { 211 "applies": true, 212 "answer": false, 213 "justification": "No funding is disclosed, so independence cannot be assessed. The absence of funding disclosure is not the same as being unfunded." 214 }, 215 "financial_interests_declared": { 216 "applies": true, 217 "answer": false, 218 "justification": "No competing interests statement is present in the paper." 219 } 220 }, 221 "contamination": { 222 "training_cutoff_stated": { 223 "applies": false, 224 "answer": false, 225 "justification": "This paper does not evaluate pre-trained model capability on a benchmark. It uses LLMs as proposal generators within an optimization loop; the LLMs are not being tested on memorized knowledge but on their ability to propose compiler transformations in context." 226 }, 227 "train_test_overlap_discussed": { 228 "applies": false, 229 "answer": false, 230 "justification": "Same as above — the paper does not evaluate LLMs on a benchmark where memorization would be a concern. The LLMs are generating transformation proposals conditioned on current program state, not recalling benchmark answers." 231 }, 232 "benchmark_contamination_addressed": { 233 "applies": false, 234 "answer": false, 235 "justification": "Same as above — contamination is not a relevant concern for this type of system where LLMs are used as optimization proposal generators rather than being evaluated on benchmark knowledge." 
236 } 237 }, 238 "human_studies": { 239 "pre_registered": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "irb_or_ethics_approval": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "demographics_reported": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "inclusion_exclusion_criteria": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "randomization_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "blinding_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "attrition_reported": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 } 274 }, 275 "cost_and_practicality": { 276 "inference_cost_reported": { 277 "applies": true, 278 "answer": false, 279 "justification": "The paper does not report API costs, tokens consumed, or wall-clock time for the optimization process despite calling GPT-5.2 and other models hundreds of times per experiment. Model invocation counts are reported (Tables 3-5) but not the associated dollar cost or latency." 280 }, 281 "compute_budget_stated": { 282 "applies": true, 283 "answer": false, 284 "justification": "No total computational budget is stated. The paper does not report total API spend, total wall-clock time for the experiments, or GPU hours used for hardware-side evaluation." 
285 } 286 } 287 }, 288 "claims": [ 289 { 290 "claim": "COLT(8 Models) achieves 10.86x average speedup on CPU and 30.05x on GPU across five benchmarks, outperforming the single-large-model baseline (GPT-5.2).", 291 "evidence": "Figures 2 and 4 show speedup curves across five benchmarks on CPU and GPU. The abstract and Section 4.2 report these averaged numbers.", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "COLT reduces calls to the largest LLM to 23.9% of total calls (including course alteration) while outperforming using only the largest model.", 296 "evidence": "Table 1 shows GPT-5.2 accounts for 12.3% of regular invocations and 23.9% total (including course alteration) for COLT(8 Models) on CPU.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Performance improves monotonically as the candidate model set is expanded from 2 to 4 to 8 models.", 301 "evidence": "Figure 2 shows COLT(8 Models) >= COLT(4 Models) >= COLT(2 Models) in final speedup across all five CPU benchmarks.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "COLT is robust to the choice of largest model, working with both proprietary GPT-5.2 and open-source Llama-3.3-70B-Instruct.", 306 "evidence": "Section 4.3 and Figure 3 show COLT(8 Models) achieves the best final speedup on every benchmark with Llama-3.3-70B-Instruct as largest model, improving over the baseline by 4.8-27.5%.", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "COLT generalizes across hardware platforms (CPU and GPU).", 311 "evidence": "Section 4.4 and Figure 4 show COLT consistently outperforms single-model baselines on NVIDIA 2080 Ti GPU.", 312 "supported": "moderate" 313 }, 314 { 315 "claim": "Collaboration enhances the quality of proposed transformations compared to a single model, with large early-budget gains.", 316 "evidence": "Section 4.2: 'on FLUX Convolution Layer at 100 samples, COLT(8 Models) reaches 3.83x speedup, a 49.6% improvement over the single-large-model baseline.' 
Similar early gains are shown in other benchmarks.", 317 "supported": "weak" 318 } 319 ], 320 "methodology_tags": [ 321 "benchmark-eval" 322 ], 323 "key_findings": "COLT proposes a lightweight multi-LLM collaboration framework for compiler optimization that embeds model selection within a shared MCTS tree, enabling smaller models to drive most optimization decisions while selectively escalating to larger models. Across five computational kernels on CPU and GPU hardware, COLT consistently outperforms single-model baselines, achieving up to 10.86x average speedup on CPU with the largest model accounting for only 23.9% of total invocations. Performance scales monotonically with the number of collaborative models (2, 4, 8) and the approach is robust to substituting different largest models (GPT-5.2 vs Llama-3.3-70B-Instruct).", 324 "red_flags": [ 325 { 326 "flag": "No variance or uncertainty reporting despite 10 runs", 327 "detail": "The paper explicitly states each experiment is repeated 10 times but reports only mean values. No standard deviations, error bars, or confidence intervals are provided in any figure or table, making it impossible to assess whether the reported improvements are statistically meaningful." 328 }, 329 { 330 "flag": "No negative results", 331 "detail": "Every COLT configuration outperforms every baseline on every benchmark. No failed configurations, approaches that didn't work, or settings where collaboration hurt performance are reported, which is suspicious for a system with multiple hyperparameters and design choices." 332 }, 333 { 334 "flag": "No limitations discussion", 335 "detail": "The paper lacks any limitations section, threats-to-validity discussion, or explicit scoping of what the results do not show. The Impact Statement is a single paragraph that does not address limitations." 
336 }, 337 { 338 "flag": "Missing API cost analysis", 339 "detail": "The paper's central claim is about lightweight collaboration, but no API costs or wall-clock times are reported. Without knowing the cost of hundreds of GPT-5.2 calls per experiment, it is impossible to assess practical viability or compare against the cost of simply running the single-large-model baseline longer." 340 }, 341 { 342 "flag": "Narrow benchmark coverage", 343 "detail": "Results are from 5 computational kernels (attention, MoE, convolution, MLP layers) on only 2 hardware platforms, yet the claims are framed broadly for 'compiler optimization' and 'model compilation' without bounding generalization." 344 } 345 ], 346 "cited_papers": [ 347 { 348 "title": "Large language models for compiler optimization", 349 "authors": ["C. Cummins", "V. Seeker", "D. Grubisic", "M. Elhoushi", "Y. Liang", "B. Roziere", "J. Gehring", "F. Gloeckle", "K. Hazelwood", "G. Synnaeve", "H. Leather"], 350 "year": 2023, 351 "relevance": "Foundational work on using LLMs for compiler optimization, establishing the paradigm that COLT extends to multi-model settings." 352 }, 353 { 354 "title": "LLM Compiler: Foundation language models for compiler optimization", 355 "authors": ["C. Cummins", "V. Seeker", "D. Grubisic", "B. Roziere", "J. Gehring", "G. Synnaeve", "H. Leather"], 356 "year": 2025, 357 "relevance": "Trains foundation models specifically for compiler optimization, representing the single-large-model approach that COLT aims to outperform." 358 }, 359 { 360 "title": "CompilerDream: Learning a compiler world model for general code optimization", 361 "authors": ["C. Deng", "J. Wu", "N. Feng", "J. Wang", "M. Long"], 362 "year": 2025, 363 "relevance": "Uses world models for compiler optimization, another approach to LLM-guided compilation that COLT builds upon." 364 }, 365 { 366 "title": "Compiler-R1: Towards agentic compiler auto-tuning with reinforcement learning", 367 "authors": ["H. Pan", "H. Lin", "H. 
Luo", "Y. Liu", "K. Yao", "L. Zhang", "M. Xing", "Y. Wu"], 368 "year": 2025, 369 "relevance": "Applies reinforcement learning to agentic compiler optimization, directly relevant as a prior single-LLM approach for compilation." 370 }, 371 { 372 "title": "REASONING COMPILER: LLM-guided optimizations for efficient model serving", 373 "authors": ["S. Tang", "C. Priebe", "R. Mahapatra", "L. Qin", "H. Esmaeilzadeh"], 374 "year": 2025, 375 "relevance": "Prior work by the same group on LLM-guided compiler optimization using MCTS, the direct predecessor to COLT." 376 }, 377 { 378 "title": "FrugalGPT: How to use large language models while reducing cost and improving performance", 379 "authors": ["L. Chen", "M. Zaharia", "J. Zou"], 380 "year": 2023, 381 "relevance": "Pioneering work on cost-aware LLM usage through cascades and model selection, directly relevant to COLT's multi-model cost reduction goal." 382 }, 383 { 384 "title": "Hybrid LLM: Cost-efficient and quality-aware query routing", 385 "authors": ["D. Ding", "A. Mallick", "C. Wang", "R. Sim", "S. Mukherjee", "V. Ruhle", "L. V. Lakshmanan", "A. H. Awadallah"], 386 "year": 2024, 387 "relevance": "Addresses cost-efficient model routing between different LLMs, a core challenge that COLT solves differently via endogenous MCTS-based selection." 388 }, 389 { 390 "title": "Improving factuality and reasoning in language models through multiagent debate", 391 "authors": ["Y. Du", "S. Li", "A. Torralba", "J. B. Tenenbaum", "I. Mordatch"], 392 "year": 2024, 393 "relevance": "Multi-LLM collaboration through debate, representing the explicit-communication approach that COLT's implicit shared-tree collaboration contrasts with." 394 }, 395 { 396 "title": "SLM-MUX: Orchestrating small language models for reasoning", 397 "authors": ["C. Wang", "Z. Wan", "H. Kang", "E. Chen", "Z. Xie", "T. Krishna", "V. J. Reddi", "Y. 
Du"], 398 "year": 2026, 399 "relevance": "Contemporary work on orchestrating small LLMs for reasoning tasks, directly relevant to COLT's approach of using primarily small models." 400 }, 401 { 402 "title": "Mixture-of-agents enhances large language model capabilities", 403 "authors": ["J. Wang", "J. Wang", "B. Athiwaratkun", "C. Zhang", "J. Zou"], 404 "year": 2025, 405 "relevance": "Multi-LLM collaboration via mixture-of-agents, relevant as an alternative collaboration paradigm to COLT's shared-tree approach." 406 }, 407 { 408 "title": "Agentic AI: a comprehensive survey of architectures, applications, and future directions", 409 "authors": ["M. Abou Ali", "F. Dornaika", "J. Charafeddine"], 410 "year": 2025, 411 "relevance": "Survey of agentic AI architectures, relevant context for COLT's claim of being lightweight compared to conventional agentic machinery." 412 }, 413 { 414 "title": "A unified approach to routing and cascading for LLMs", 415 "authors": ["J. Dekoninck", "M. Baader", "M. Vechev"], 416 "year": 2025, 417 "relevance": "Unified framework for LLM routing and cascading, directly relevant to COLT's model selection mechanism." 418 } 419 ] 420 }