scan.json (27403B)
1 { 2 "paper": { 3 "title": "SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models", 4 "authors": [ 5 "Guangxuan Xiao", 6 "Ji Lin", 7 "Mickael Seznec", 8 "Hao Wu", 9 "Julien Demouth", 10 "Song Han" 11 ], 12 "year": 2022, 13 "venue": "International Conference on Machine Learning (ICML 2023)", 14 "arxiv_id": "2211.10438", 15 "doi": "10.48550/arXiv.2211.10438" 16 }, 17 "scan_version": 2, 18 "active_modules": ["experimental_rigor", "data_leakage"], 19 "methodology_tags": ["benchmark-eval"], 20 "key_findings": "SmoothQuant proposes a mathematically equivalent per-channel scaling transformation that migrates quantization difficulty from activations to weights, enabling W8A8 quantization for LLMs up to 530B parameters with negligible accuracy loss. Integrated into PyTorch and FasterTransformer, SmoothQuant achieves up to 1.56× inference speedup and nearly 2× memory reduction compared to FP16, while methods like LLM.int8() that preserve accuracy introduce latency overhead. The method works across diverse model families (OPT, BLOOM, GLM-130B, MT-NLG, LLaMA, Llama-2, Falcon, Mistral, Mixtral) and enables serving a 530B model within a single 8-GPU node.", 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "GitHub repository URL provided in the paper header: https://github.com/mit-han-lab/smoothquant." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "All datasets used are publicly available standard benchmarks (LAMBADA, HellaSwag, PIQA, WinoGrande, OpenBookQA, RTE, COPA, WikiText, MMLU, MNLI, QNLI) and models are publicly available (OPT, BLOOM, GLM-130B, LLaMA)." 
32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "The paper mentions NVIDIA A100 80GB GPUs, PyTorch/Huggingface, FasterTransformer, and CUTLASS INT8 GEMM kernels, but provides no requirements.txt, Dockerfile, or detailed dependency versions." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper describes the method and implementation frameworks but provides no step-by-step reproduction instructions, README commands, or scripts to replicate the experiments." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "All results in Tables 3-10 and Figures 7-9 are reported as point estimates with no confidence intervals or error bars." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "Claims like 'SmoothQuant maintains accuracy' and comparisons between methods are based solely on comparing point estimates with no statistical tests." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "Speedup factors (e.g., 1.51×, 1.56×), memory savings (1.96×), and accuracy changes are reported with full baseline context (e.g., FP16: 66.9% vs SmoothQuant-O3: 66.8% in Table 3)." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "No justification given for why 512 calibration sentences were used, or why these particular model sizes and benchmarks were selected." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "All results appear to be from single runs. No standard deviations, variance across seeds, or spread measures are reported." 
69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "Four baselines compared: naive W8A8, ZeroQuant, LLM.int8(), and Outlier Suppression (Table 2). FP16 is also used as a reference." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "All baselines (ZeroQuant, LLM.int8(), Outlier Suppression) are from 2022, contemporary with the paper's submission." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": true, 85 "justification": "Figure 10 ablates the migration strength α, showing its effect on accuracy. Tables 2 and 11 show the three quantization scheme levels (O1-O3) with different accuracy-efficiency tradeoffs." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Accuracy on 7 zero-shot benchmarks, perplexity on WikiText, inference latency (ms), and GPU memory usage (GB) are all reported." 91 }, 92 "human_evaluation": { 93 "applies": false, 94 "answer": false, 95 "justification": "Human evaluation is irrelevant to the claims of maintaining numerical accuracy and improving inference efficiency of quantized models." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "Calibration uses 512 sentences from the Pile validation set, while evaluation is on separate benchmarks (LAMBADA, HellaSwag, PIQA, etc.). The paper explicitly notes: 'we calibrate the smoothing factors...once with 512 random sentences from the pre-training dataset Pile, and apply the same smoothed and quantized model for all downstream tasks.'" 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Table 3 shows per-benchmark results across 7 tasks plus WikiText perplexity. Figures 8-9 break down results by model size and sequence length." 
106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "The paper discusses where baselines fail (W8A8/ZeroQuant/Outlier Suppression produce 'nearly random results' on OPT-175B, Table 3), where SmoothQuant-O3 degrades slightly (BLOOM-176B loses 0.8%, GLM-130B loses 1%), and extreme α values cause failures (Figure 10)." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Figure 10 shows α values that fail. SmoothQuant-O3 shows slight accuracy degradation on some models. They report that LLM.int8() is slower than FP16 despite preserving accuracy. Different models have different quantization difficulties requiring different α values." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "Abstract claims of 1.56× speedup (Figure 9), 2× memory reduction (Figures 8-9), negligible accuracy loss (Tables 3-7), and serving 530B within a single node (Table 10) are all supported by experimental results." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "The causal mechanism (migrating quantization difficulty via per-channel scaling) is supported by mathematical equivalence proof (Equations 3-4) and the α ablation study (Figure 10) which shows single-variable manipulation of the migration strength." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper tests on 8+ model families (OPT, BLOOM, GLM-130B, MT-NLG, LLaMA, Llama-2, Falcon, Mistral, Mixtral) at various scales, and explicitly notes that different models require different α values, bounding generalization claims." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": false, 137 "justification": "No alternative explanations for the results are discussed. 
The paper presents the smoothing transformation as the sole explanation without considering confounds or alternative interpretations." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper measures benchmark accuracy, perplexity, inference latency, and memory usage, and reports these directly without inflating them into broader claims. They explicitly state: 'we focus on the relative performance change before and after quantization but not the absolute value.'" 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": true, 149 "justification": "Specific model names with sizes are provided (OPT-6.7B through OPT-175B, BLOOM-176B, GLM-130B, MT-NLG 530B, LLaMA-7B/13B/30B/65B, Llama-2-7B/13B/70B, Falcon-7B/40B, Mistral-7B, Mixtral-8x7B). These are open-source models where the name uniquely identifies the checkpoint." 150 }, 151 "prompts_provided": { 152 "applies": false, 153 "answer": false, 154 "justification": "The paper does not use prompting. Evaluation is done via standard evaluation harnesses (lm-eval-harness) with zero-shot settings on benchmarks." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "Migration strength α values reported per model family (0.5 for OPT/BLOOM, 0.75 for GLM-130B, 0.8-0.9 for LLaMA/Llama-2/Mistral/Mixtral, 0.6-0.7 for Falcon). Calibration details: 512 sentences from the Pile. Quantization schemes: O1/O2/O3 settings detailed in Table 2." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "No agentic scaffolding is used. This is a quantization method paper." 
165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "Calibration process documented: 512 random sentences from the Pile pre-training dataset, used to estimate activation channel statistics and compute smoothing factors. The smoothing transformation is applied offline before deployment. For GLM-130B, top 2% token clipping is applied following Wei et al." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": false, 176 "justification": "No dedicated limitations section. The paper moves from related work directly to conclusion. Appendix A discusses weight-only quantization comparison limitations but is not a general limitations section." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": false, 181 "justification": "No threats to validity are discussed. The paper does not address potential limitations of the smoothing approach or conditions under which it might fail beyond noting different models need different α values." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": false, 186 "justification": "No explicit scope boundaries stated. The paper does not discuss what settings or model types SmoothQuant does NOT apply to, or what the results do not show." 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": false, 193 "justification": "Only aggregated accuracy, perplexity, latency, and memory numbers are reported. No raw evaluation logs or per-example results are available for independent verification." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Data sources are clearly described: standard public benchmarks via lm-eval-harness and GLM-130B's official repo, calibration from 512 Pile sentences, hardware setup (NVIDIA A100 80GB GPUs)." 
199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "No human participants. All data sources are standard public benchmarks and publicly available models." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "Pipeline documented: (1) collect 512 calibration sentences from Pile, (2) compute per-channel smoothing factors using Eq. 4, (3) fuse smoothing into model weights offline, (4) apply INT8 quantization, (5) evaluate on benchmarks using standard harnesses." 209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": true, 215 "justification": "Acknowledgements list: MIT-IBM Watson AI Lab, MIT AI Hardware Program, Amazon and MIT Science Hub, NVIDIA Academic Partnership Award, Qualcomm Innovation Fellowship, Microsoft Turing Academic Program, and NSF." 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Author affiliations clearly stated: Guangxuan Xiao, Ji Lin, Song Han from MIT; Mickael Seznec, Hao Wu, Julien Demouth from NVIDIA." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": false, 225 "justification": "NVIDIA is both a funder (NVIDIA Academic Partnership Award) and employer of three co-authors. NVIDIA has direct commercial interest in efficient LLM inference on their GPUs (A100, INT8 GEMM, CUTLASS, Tensor Cores). The method is designed specifically for NVIDIA hardware." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests statement despite NVIDIA employees co-authoring a paper that specifically demonstrates performance on NVIDIA hardware and integrates into NVIDIA's FasterTransformer framework." 
231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": true, 236 "answer": false, 237 "justification": "No training data cutoff dates stated for any of the evaluated models (OPT, BLOOM, GLM-130B, etc.)." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": true, 241 "answer": false, 242 "justification": "Not systematically discussed. The paper notes that 'some of the aforementioned benchmarks appear in the training set of GLM-130B' and uses different benchmarks for that model, but does not address overlap for OPT or BLOOM." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": true, 246 "answer": false, 247 "justification": "Contamination is only partially addressed: GLM-130B gets different benchmarks due to known training set overlap, but no contamination analysis is performed for OPT or BLOOM models evaluated on benchmarks that may appear in their training data." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in this study." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants in this study." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in this study." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in this study." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in this study." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants in this study." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants in this study." 
285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": true, 291 "justification": "Inference latency (ms) and memory usage (GB) extensively reported across models and configurations in Figures 8-9, Tables 8 and 10. Includes context-stage and decoding-stage measurements." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "Hardware is stated (NVIDIA A100 80GB GPUs) but total computational budget (GPU hours for calibration, evaluation, and experiments) is not quantified." 297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "No mention of random seeds or variance across runs. All results appear to be single-run numbers." 304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": false, 308 "justification": "The number of experimental runs is never stated. Results are presented without indicating how many runs produced them." 309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": false, 313 "justification": "The paper mentions 'running a quick grid search on a subset of the Pile validation set' for α and shows an α sweep in Figure 10, but the exact number of configurations tried and compute spent on search is not stated." 314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": true, 318 "justification": "Selection of α=0.5 as default is justified mathematically (equal difficulty sharing, Sec. 4) and empirically (Figure 10, sweet spot on Pile validation subset). Model-specific adjustments (α=0.75 for GLM-130B, α=0.8-0.9 for LLaMA/Llama-2) are explained by activation outlier severity." 319 }, 320 "multiple_comparison_correction": { 321 "applies": false, 322 "answer": false, 323 "justification": "No statistical significance tests are performed, so multiple comparison correction is inapplicable." 
324 }, 325 "self_comparison_bias_addressed": { 326 "applies": true, 327 "answer": false, 328 "justification": "The authors implement all baselines themselves (W8A8, ZeroQuant, Outlier Suppression) without acknowledging self-comparison bias or using independent implementations." 329 }, 330 "compute_budget_vs_performance": { 331 "applies": true, 332 "answer": true, 333 "justification": "Table 11 compares latency across quantization schemes (O1-O3, FP16, LLM.int8()). Tables 3-4 show accuracy at each efficiency level. Figures 8-9 compare latency and memory at matched and different GPU counts." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": false, 338 "justification": "Standard benchmarks (LAMBADA, HellaSwag, PIQA, etc.) are used without any discussion of whether they adequately measure model capability preservation under quantization." 339 }, 340 "scaffold_confound_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "No scaffolding is involved. This is a quantization method paper that evaluates models directly on benchmarks." 344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "No discussion of whether benchmark datasets existed before the models' training data cutoffs." 351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "No discussion of feature leakage in the evaluation setup." 356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "No discussion of potential non-independence between training data and test benchmarks." 361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": false, 365 "justification": "No concrete leakage detection or prevention method is used, except for switching benchmarks for GLM-130B due to known training set overlap." 
366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "SmoothQuant maintains FP16 accuracy under W8A8 INT8 quantization for LLMs up to 530B parameters", 372 "evidence": "Tables 3-7 show negligible accuracy loss across OPT-175B (66.9% → 66.8% average), BLOOM-176B (68.2% → 67.4-68.4%), GLM-130B (73.8% → 72.5-73.7%), MT-NLG 530B (73.1% → 73.1%), LLaMA (11.51 → 11.56 PPL for 7B), and newer models in Table 7.", 373 "supported": "strong" 374 }, 375 { 376 "claim": "SmoothQuant achieves up to 1.56× inference speedup over FP16", 377 "evidence": "Figure 9 shows 1.56× speedup for OPT-13B at sequence length 128 in FasterTransformer. Figure 8 shows up to 1.51× speedup in PyTorch for OPT-30B. Table 8 shows up to 1.42× decoding-stage speedup.", 378 "supported": "strong" 379 }, 380 { 381 "claim": "SmoothQuant reduces GPU memory usage by approximately 2×", 382 "evidence": "Figure 8 shows 1.96× memory saving for OPT-30B in PyTorch. Figure 9 shows nearly 2× memory reduction in FasterTransformer across all OPT model sizes.", 383 "supported": "strong" 384 }, 385 { 386 "claim": "SmoothQuant enables serving a 530B model within a single 8-GPU node", 387 "evidence": "Table 10 shows MT-NLG 530B served on 8 A100 GPUs (527-570 GB) with INT8 vs 16 GPUs (1040-1095 GB) with FP16, at similar or better latency.", 388 "supported": "strong" 389 }, 390 { 391 "claim": "LLM.int8() introduces latency overhead despite preserving accuracy", 392 "evidence": "Table 11 shows LLM.int8() is slower than FP16 in most cases (e.g., 237.1ms vs 152.6ms for OPT-13B at seq 256). Figure 8 confirms LLM.int8() is 'almost always slower than the FP16 baseline.'", 393 "supported": "strong" 394 }, 395 { 396 "claim": "Migration strength α=0.5 is a general sweet spot for balancing quantization difficulty", 397 "evidence": "Figure 10 shows the α sweep on OPT-175B with LAMBADA, demonstrating a sweet spot at 0.4-0.6. 
However, this is only shown for one model on one benchmark; other models require different α values (0.75 for GLM-130B, 0.6-0.9 for newer models).", 398 "supported": "moderate" 399 }, 400 { 401 "claim": "Activation outliers in LLMs persist in fixed channels across tokens", 402 "evidence": "Figure 4 visualizes activation magnitudes in OPT-13B showing consistent outlier channels. Table 1 shows per-channel activation quantization preserves accuracy while per-token/per-tensor fail, confirming the channel-persistent pattern.", 403 "supported": "strong" 404 } 405 ], 406 "red_flags": [ 407 { 408 "flag": "No error bars or variance", 409 "detail": "All accuracy, perplexity, latency, and memory results are point estimates from apparent single runs. No standard deviations, confidence intervals, or variance across seeds are reported, making it impossible to assess result stability." 410 }, 411 { 412 "flag": "NVIDIA conflict of interest", 413 "detail": "Three co-authors (Seznec, Wu, Demouth) are NVIDIA employees. NVIDIA funds the research and has direct commercial interest in efficient LLM inference on their GPUs. The method is designed for and benchmarked on NVIDIA hardware (A100, CUTLASS, FasterTransformer). No competing interests statement is provided." 414 }, 415 { 416 "flag": "No limitations section", 417 "detail": "The paper has no dedicated limitations section and does not discuss conditions under which SmoothQuant might fail, degrade, or be inapplicable. This is unusual for a paper making broad generalization claims." 418 }, 419 { 420 "flag": "Self-implemented baselines", 421 "detail": "All baselines appear to be the authors' own implementations. No independent baseline implementations are used, and the potential for self-comparison bias is not acknowledged." 
422 } 423 ], 424 "cited_papers": [ 425 { 426 "title": "LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale", 427 "authors": ["Tim Dettmers", "Mike Lewis", "Younes Belkada", "Luke Zettlemoyer"], 428 "year": 2022, 429 "arxiv_id": "2208.07339", 430 "relevance": "Key baseline for LLM quantization that uses mixed INT8/FP16 decomposition to handle activation outliers." 431 }, 432 { 433 "title": "ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers", 434 "authors": ["Zhewei Yao", "Reza Yazdani Aminabadi", "Minjia Zhang", "Xiaoxia Wu", "Conglong Li", "Yuxiong He"], 435 "year": 2022, 436 "arxiv_id": "2206.01861", 437 "relevance": "Baseline quantization method using per-token activation and group-wise weight quantization for LLMs." 438 }, 439 { 440 "title": "GPTQ: Accurate Post-Training Quantization for Generative Pre-Trained Transformers", 441 "authors": ["Elias Frantar", "Saleh Ashkboos", "Torsten Hoefler", "Dan Alistarh"], 442 "year": 2022, 443 "arxiv_id": "2210.17323", 444 "relevance": "Weight-only quantization method for LLMs, representing an orthogonal approach to SmoothQuant's weight-activation quantization." 445 }, 446 { 447 "title": "Language Models are Few-Shot Learners", 448 "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"], 449 "year": 2020, 450 "relevance": "GPT-3 paper establishing the scale of LLMs that necessitates quantization solutions." 451 }, 452 { 453 "title": "OPT: Open Pre-trained Transformer Language Models", 454 "authors": ["Susan Zhang", "Stephen Roller", "Naman Goyal"], 455 "year": 2022, 456 "arxiv_id": "2205.01068", 457 "relevance": "Primary evaluation model family for SmoothQuant, open-source LLMs up to 175B parameters." 
458 }, 459 { 460 "title": "BLOOM: A 176B-Parameter Open-Access Multilingual Language Model", 461 "authors": ["Teven Le Scao", "Angela Fan", "Christopher Akiki"], 462 "year": 2022, 463 "arxiv_id": "2211.05100", 464 "relevance": "One of three major LLM families used to evaluate SmoothQuant at the 176B parameter scale." 465 }, 466 { 467 "title": "GLM-130B: An Open Bilingual Pre-trained Model", 468 "authors": ["Aohan Zeng", "Xiao Liu", "Zhengxiao Du"], 469 "year": 2022, 470 "arxiv_id": "2210.02414", 471 "relevance": "LLM with particularly difficult activation outliers, used to test SmoothQuant's robustness." 472 }, 473 { 474 "title": "Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B", 475 "authors": ["Shaden Smith", "Mostofa Patwary", "Brandon Norick"], 476 "year": 2022, 477 "arxiv_id": "2201.11990", 478 "relevance": "530B parameter model used to demonstrate SmoothQuant at extreme scale, enabling single-node serving." 479 }, 480 { 481 "title": "LLaMA: Open and Efficient Foundation Language Models", 482 "authors": ["Hugo Touvron", "Thibaut Lavril", "Gautier Izacard"], 483 "year": 2023, 484 "arxiv_id": "2302.13971", 485 "relevance": "Open LLM family with less severe activation outliers, extending SmoothQuant's evaluation scope." 486 }, 487 { 488 "title": "Outlier Suppression: Pushing the Limit of Low-bit Transformer Language Models", 489 "authors": ["Xiuying Wei", "Yunchen Zhang", "Xiangguo Zhang"], 490 "year": 2022, 491 "arxiv_id": "2209.13325", 492 "relevance": "Baseline method using non-scaling LayerNorm and token-wise clipping for activation outlier handling." 493 }, 494 { 495 "title": "Understanding and Overcoming the Challenges of Efficient Transformer Quantization", 496 "authors": ["Yelysei Bondarenko", "Markus Nagel", "Tijmen Blankevoort"], 497 "year": 2021, 498 "relevance": "Early work on per-channel activation quantization for transformers, demonstrating the accuracy preservation that motivated SmoothQuant's approach." 499 } 500 ] 501 }