scan.json (20092B)
1 { 2 "paper": { 3 "title": "ZeroQuant-V2: Exploring Post-training Quantization in LLMs from Comprehensive Study to Low Rank Compensation", 4 "authors": ["Zhewei Yao", "Xiaoxia Wu", "Cheng Li", "Stephen Youn", "Yuxiong He"], 5 "year": 2023, 6 "venue": "arXiv", 7 "arxiv_id": "2303.08302" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "The paper states 'Code will be released as a part of https://github.com/microsoft/DeepSpeed' — a promise of future release counts as NO per schema guidance." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses publicly available datasets (Wikitext-2, PTB, C4) and publicly available model families (OPT, BLOOM)." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section listing library versions is provided." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions or README with commands to run are provided in the paper." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Results are reported as point estimates (perplexity values) with no confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper makes comparative claims about methods (e.g., 'GPTQ typically performs better') based solely on comparing perplexity numbers without any statistical significance tests." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports perplexity differences with baseline context throughout (e.g., Table 2 shows FP16 baseline alongside quantized values, and the Class system defines degradation thresholds of ≤0.1, >0.1 & ≤0.5, >0.5 PPL)." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification is given for why three datasets (Wikitext-2, PTB, C4) were chosen or why this is sufficient for the claims made." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper mentions in Table B.1 that 'varying the random seed has minimal impact' but does not report variance or standard deviation across runs for the main results. Single-run numbers are presented." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "RTN (round-to-nearest) is used as a baseline, and FP16/INT8 is used as a no-accuracy-loss baseline. Multiple PTQ methods (GPTQ, ZQ-Local, ZQ-Global) are compared." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "GPTQ (2022), ZeroQuant (2022), and SmoothQuant (2022) are contemporary methods at the time of writing (2023)." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Table 9 provides an ablation on the low-rank dimension m (1, 4, 8, 16, 32). The paper also systematically varies quantization block sizes, bit widths, and methods." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": false, 78 "justification": "Only perplexity (PPL) is used as the evaluation metric. No downstream task accuracy, F1, or other metrics are reported." 79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "Human evaluation is not relevant for a quantization methods comparison paper measuring perplexity." 
84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "The paper uses standard validation splits of Wikitext-2, PTB, and C4, which are separate from any calibration data used for quantization." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down per model size (125M to 176B), per model family (OPT vs BLOOM), per dataset (Wikitext-2, PTB, C4), and per quantization configuration." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "The paper discusses OPT-66B's anomalous behavior, divergence issues with symmetric activation quantization on OPT models, and cases where quantization leads to 'exploded' perplexity (e.g., W2A16 on BLM-176b)." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that 'none of the current methods can achieve the original model quality for quantization with either INT4-weight or INT4-weight-and-INT8-activation' and that ZQ-Global underperforms GPTQ for larger models despite having more parameters." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract's three claims (sensitivity analysis, evaluation of PTQ methods, LoRC proposal) are all supported by corresponding sections and tables in the paper." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "Causal claims like 'LoRC enhances model quality recovery' are supported by controlled ablation studies (Table 9) with single-variable manipulation (varying m while holding other factors constant)." 
116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "The Limitation section explicitly states: 'caution should be exercised when generalizing these findings to tasks that are dissimilar to those covered in this study' and acknowledges the study was limited to two model families and three datasets." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper discusses alternative explanations for OPT-66B's poor quantization (dead neurons in early layers, Layer Norm issues) and provides eigenvalue analysis to explain why LoRC improvements plateau after m=8." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Specific model sizes are specified (OPT-125M through OPT-66B, BLOOM-560M through BLOOM-176B). These are open-source models with fixed weights, so version ambiguity is not an issue as with API-based models." 133 }, 134 "prompts_provided": { 135 "applies": false, 136 "answer": false, 137 "justification": "This paper does not use prompting — it measures perplexity on text datasets using quantized models." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper refers to Appendix B for GPTQ, ZQ-Local, and ZQ-Global hyperparameters, and discusses finding optimal configurations. Quantization parameters (block sizes, bit widths, symmetric/asymmetric) are detailed." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used in this work." 
148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "The paper describes using zero-shot validation perplexity on three datasets, specifies per-row quantization for weights and per-token quantization for activations, and documents the quantization procedure in Appendix A." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 6 (Discussion) contains a dedicated 'Limitation' subsection discussing resource constraints, dataset choices, and generalizability caveats." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "The limitations are specific: 'we strategically limited our datasets to WikiText, PTB, and C4 to concentrate on a broad range of quantization methods' and acknowledge the trade-off between model diversity and task diversity." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "The paper explicitly states: 'caution should be exercised when generalizing these findings to tasks that are dissimilar to those covered in this study' and notes the study was constrained to two model families." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "Raw experimental results beyond what is shown in tables and appendices are not made available for independent verification." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "The paper describes using standard public datasets (Wikitext-2, PTB, C4) and public model checkpoints (OPT, BLOOM), with quantization procedures documented." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants; data sources are standard public benchmarks and models." 
187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The quantization pipeline is documented: model loading → quantization (with specified method and parameters) → perplexity evaluation on validation sets. Appendix A describes quantization background and Appendix B covers hyperparameters." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding or acknowledgments section is present in the paper. All authors are from Microsoft." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "All authors are clearly listed as being from Microsoft with their Microsoft email addresses." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "All authors are Microsoft employees. Microsoft has a financial interest in efficient LLM deployment (DeepSpeed is a Microsoft product). The funder is not independent of the outcome." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement is present in the paper. The code is slated for release as part of Microsoft's DeepSpeed project." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": false, 219 "answer": false, 220 "justification": "This paper evaluates quantization methods on pre-existing models using perplexity, not model capability on benchmarks. Contamination is not relevant — the evaluation measures compression quality, not whether the model 'knows' the answers." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": false, 224 "answer": false, 225 "justification": "Not applicable — the paper measures perplexity degradation from quantization, not model capability on benchmarks."
226 }, 227 "benchmark_contamination_addressed": { 228 "applies": false, 229 "answer": false, 230 "justification": "Not applicable — perplexity evaluation of quantization quality is not affected by train/test contamination in the same way as benchmark capability evaluation." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "The paper does not report the computational cost of running the quantization methods or inference latency of quantized models, despite this being directly relevant to the practical motivation." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "The paper mentions 'over 10,000 experiments' and being 'constrained by computing resources' but does not quantify GPU hours, hardware used, or total compute budget." 
280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "INT8 weight-only quantization results in negligible accuracy loss (less than 0.05 PPL) across all tested models.", 286 "evidence": "Table 2 shows W8sym-A16 and W8asym-A16 results matching W16-A16 baselines within 0.05 PPL for all OPT and BLOOM models (Section 3).", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Activation quantization is more sensitive than weight quantization, and smaller models tolerate activation quantization better than larger models.", 291 "evidence": "Table 2 shows INT8 activation causes >0.5 PPL degradation for models >10B (Class-3) while smaller models remain Class-1 (Section 3).", 292 "supported": "strong" 293 }, 294 { 295 "claim": "No existing PTQ method achieves Class-1 (≤0.1 PPL degradation) for INT4 weight-only or W4A8 quantization with per-row granularity.", 296 "evidence": "Table 3 shows all methods produce Class-2 or Class-3 degradation for W4A16 and W4A8 with per-row quantization (Section 4, Finding 2).", 297 "supported": "strong" 298 }, 299 { 300 "claim": "LoRC with fine-grained quantization nearly recovers original model quality for INT4 weight quantization on large models.", 301 "evidence": "Table 7 shows W4A16 with FGQ+LoRC achieves PPL within 0.09 of FP16 baseline for OPT-6.7b (11.99 vs 11.90) and within 0.03 for OPT-30b (10.70 vs 10.70) (Section 5).", 302 "supported": "strong" 303 }, 304 { 305 "claim": "The optimal LoRC dimension is approximately m=8, as eigenvalues of the error matrix flatten rapidly after index 8.", 306 "evidence": "Table 9 shows diminishing returns beyond m=4-8, and Figure 2 shows eigenvalue plots flattening after index 8 (Section 5).", 307 "supported": "strong" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "This paper provides a systematic comparison of post-training quantization methods (RTN, GPTQ, ZeroQuant variants) across OPT and BLOOM model families (125M-176B parameters).
Key findings include that INT8 weight-only quantization is essentially lossless, activation quantization is harder than weight quantization (especially for larger models), and existing methods cannot achieve ≤0.1 PPL degradation for INT4 with per-row granularity. The proposed Low-Rank Compensation (LoRC) method uses SVD on the quantization error matrix to recover model quality with minimal size increase (~1.6%), and combined with fine-grained quantization, nearly recovers original FP16 quality for INT4 weight quantization on large models.", 312 "red_flags": [ 313 { 314 "flag": "Single metric evaluation", 315 "detail": "All results use only perplexity as the evaluation metric. No downstream task accuracy is reported, which limits understanding of how quantization affects actual model capability on real tasks." 316 }, 317 { 318 "flag": "No variance or error bars", 319 "detail": "Despite claiming over 10,000 experiments, no variance, standard deviation, or confidence intervals are reported for any results. The paper notes that varying the random seed has minimal impact but does not demonstrate this systematically." 320 }, 321 { 322 "flag": "Company evaluating its own product", 323 "detail": "All authors are Microsoft employees, and the proposed LoRC method is slated for release as part of Microsoft's DeepSpeed. The paper benchmarks against methods from other groups (GPTQ) while proposing an extension to Microsoft's own ZeroQuant line of work. No conflict of interest statement is provided." 324 } 325 ], 326 "cited_papers": [ 327 { 328 "title": "GPTQ: Accurate Post-Training Quantization for Generative Pre-Trained Transformers", 329 "authors": ["Elias Frantar", "Saleh Ashkboos", "Torsten Hoefler", "Dan Alistarh"], 330 "year": 2022, 331 "arxiv_id": "2210.17323", 332 "relevance": "Key baseline PTQ method for LLMs; foundational work in INT4 weight-only quantization."
333 }, 334 { 335 "title": "ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers", 336 "authors": ["Zhewei Yao", "Reza Yazdani Aminabadi", "Minjia Zhang", "Xiaoxia Wu", "Conglong Li", "Yuxiong He"], 337 "year": 2022, 338 "arxiv_id": "2206.01861", 339 "relevance": "Predecessor work proposing INT8 quantization with layer-by-layer distillation for LLMs." 340 }, 341 { 342 "title": "SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models", 343 "authors": ["Guangxuan Xiao", "Ji Lin", "Mickael Seznec", "Julien Demouth", "Song Han"], 344 "year": 2022, 345 "arxiv_id": "2211.10438", 346 "relevance": "Alternative PTQ approach that smooths activation outliers by migrating quantization difficulty to weights." 347 }, 348 { 349 "title": "LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale", 350 "authors": ["Tim Dettmers", "Mike Lewis", "Younes Belkada", "Luke Zettlemoyer"], 351 "year": 2022, 352 "arxiv_id": "2208.07339", 353 "relevance": "Mixed-precision INT8 quantization method for LLMs addressing activation outliers." 354 }, 355 { 356 "title": "The Case for 4-bit Precision: k-bit Inference Scaling Laws", 357 "authors": ["Tim Dettmers", "Luke Zettlemoyer"], 358 "year": 2022, 359 "arxiv_id": "2212.09720", 360 "relevance": "Scaling law analysis of weight-only quantization for LLMs." 361 }, 362 { 363 "title": "OPT: Open Pre-trained Transformer Language Models", 364 "authors": ["Susan Zhang", "Stephen Roller", "Naman Goyal"], 365 "year": 2022, 366 "arxiv_id": "2205.01068", 367 "relevance": "One of two model families evaluated; important open-source LLM suite." 368 }, 369 { 370 "title": "BLOOM: A 176B-Parameter Open-Access Multilingual Language Model", 371 "authors": ["Teven Le Scao", "Angela Fan", "Christopher Akiki"], 372 "year": 2022, 373 "arxiv_id": "2211.05100", 374 "relevance": "Second model family evaluated; largest open-access model at the time." 
375 }, 376 { 377 "title": "A Survey of Quantization Methods for Efficient Neural Network Inference", 378 "authors": ["Amir Gholami", "Sehoon Kim", "Zhen Dong", "Zhewei Yao", "Michael W Mahoney", "Kurt Keutzer"], 379 "year": 2021, 380 "arxiv_id": "2103.13630", 381 "relevance": "Comprehensive survey of quantization methods for neural networks." 382 } 383 ] 384 }