scan.json (25770B)
1 { 2 "paper": { 3 "title": "OJBKQ: Objective-Joint Babai-Klein Quantization", 4 "authors": [ 5 "Xinyu Wang", 6 "Ziyu Zhao", 7 "Peng Lu", 8 "Yu Gu", 9 "Xiao-Wen Chang" 10 ], 11 "year": 2026, 12 "venue": "arXiv", 13 "arxiv_id": "2602.08376" 14 }, 15 "scan_version": 2, 16 "active_modules": ["experimental_rigor", "data_leakage"], 17 "methodology_tags": ["benchmark-eval"], 18 "key_findings": "OJBKQ reformulates layer-wise post-training quantization as a box-constrained integer least squares problem and solves it using an extended Klein randomized Babai algorithm with K parallel candidate paths. Combined with the Joint Target Alignment (JTA) scoring objective that interpolates between runtime-quantized and full-precision references, OJBKQ consistently achieves lower perplexity and higher accuracy than GPTQ, AWQ, and QUIP at 3-4 bits across LLaMA, Qwen3, and Mistral models. The advantage is more pronounced at 3-bit settings where prior methods often fail catastrophically.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "No code repository, GitHub link, or archive URL is provided anywhere in the paper." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "All evaluation datasets are publicly available standard benchmarks: C4, WikiText-2, ARC, BoolQ, HellaSwag, PIQA, WinoGrande, GSM8K, GPQA, MBPP. All models used are publicly released (LLaMA, Qwen3, Mistral)." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "No requirements.txt, Dockerfile, conda environment file, or detailed environment setup section is provided. Hardware and software dependencies are not specified." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "No step-by-step reproduction instructions, README, or scripts to replicate experiments are provided." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "All results in Tables 1-3 are point estimates with no confidence intervals, error bars, or uncertainty quantification." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper claims OJBKQ 'outperforms' and 'achieves the lowest' perplexity based on comparing raw numbers without any statistical significance tests." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Tables 1-3 present absolute performance numbers for both baselines and the proposed method side by side (e.g., GPTQ 9.41 vs Ours 9.33 on C4), providing sufficient context for the reader to assess the magnitude of improvements." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "The calibration set is 128 samples from C4 with sequence length 2048, but no justification is given for why 128 samples is sufficient." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "All results are single-run numbers with no standard deviation, variance, or spread measures reported. This is particularly concerning given the method uses randomized rounding (Klein's algorithm)." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper compares against RTN, GPTQ, AWQ, and QUIP as baselines (Section 4, Tables 1-3)." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "GPTQ (2023), AWQ (2024), and QUIP (2024) are contemporary state-of-the-art PTQ methods. The paper also references their most recent versions." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "The paper ablates components incrementally: Ours(N) = Naive Babai only, Ours(R) = Random-K Babai, Ours = Random-K with JTA. Additional ablations on K (Figure 2) and µ/λ hyperparameters (Figure 3, Table 4) are provided." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "Evaluation uses perplexity (C4, WikiText-2), zero-shot accuracy (ARC, BoolQ, HellaSwag, PIQA, WinoGrande), and reasoning accuracy (GSM8K, GPQA, MBPP)." 89 }, 90 "human_evaluation": { 91 "applies": false, 92 "answer": false, 93 "justification": "Human evaluation is not relevant for evaluating quantization method quality, which is measured by model performance on standard benchmarks." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": true, 98 "justification": "Calibration uses 128 samples from C4, while evaluation is performed on standard test splits of C4, WikiText-2, and other benchmarks. Zero-shot and reasoning tasks use the lm-harness library with standard evaluation protocols." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Results are broken down by model family (LLaMA-2, LLaMA-3, Qwen3, Mistral), model size (0.6B to 13B), bit-width (3-bit, 4-bit), group size (g128, no grouping), and individual benchmark tasks." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": false, 108 "justification": "The paper does not discuss cases where the method underperforms. For example, Ours(N) gives 47.23/42.75 on Q3-0.6B at 3-bit (worse than GPTQ's 43.60/41.54), but this is not discussed." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": false, 113 "justification": "No negative results are reported. Every presentation frames the method positively. Cases where the method is worse than baselines (e.g., Ours(N) on Qwen3-0.6B/4B at 3-bit) are present in tables but not acknowledged or discussed." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "The abstract claims 'lower perplexity at 3–4 bits compared to existing PTQ approaches' — supported by Table 1 across most settings. 'Comparable computational cost' — supported by Figure 4 showing modest overhead at K=5." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "Causal claims ('Random-K expands the quality of the candidate set, while JTA improves how we select') are justified through the incremental ablation: Ours(N) → Ours(R) → Ours, which isolates each component's contribution via controlled single-variable manipulation." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": true, 130 "justification": "The abstract bounds claims to 'large language models' at '3–4 bits,' and the experiments test specific model families (LLaMA-2/3, Qwen3, Mistral) at those bit widths. The intro further scopes to 'standard and practically relevant 4-bit and 3-bit settings (e.g., group size 128).'" 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "No alternative explanations for the observed improvements are discussed. The paper does not consider whether improvements could be due to specific model architectures, calibration data selection, or other factors." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": true, 140 "justification": "The paper measures perplexity and accuracy on standard benchmarks and claims improvements in quantization quality — no proxy gap exists. Claims match the granularity of measurements." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": true, 147 "justification": "Specific model names and sizes are provided: LLaMA-2-7B, LLaMA-2-13B, LLaMA-3-8B, Qwen3-0.6B, Qwen3-4B, Qwen3-8B, Mistral-7B. For open-weight models, the model name and size uniquely identify the checkpoint." 148 }, 149 "prompts_provided": { 150 "applies": false, 151 "answer": false, 152 "justification": "This is a weight quantization method that operates on model parameters, not through prompting." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": true, 157 "justification": "Key hyperparameters are reported: K=5, µ=0.6/λ=0.6 for 3-bit, µ=0.1/λ=0.2 for 4-bit, group size 128, calibration: 128 samples from C4 with seq length 2048, α formula given in Eq. 13." 158 }, 159 "scaffolding_described": { 160 "applies": false, 161 "answer": false, 162 "justification": "No agentic scaffolding is used. This is a numerical optimization method for weight quantization." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Calibration procedure is described: '128 samples from the C4 dataset with a sequence length of 2048 tokens.' Scale and zero-point computation uses 'standard statistical calibration methods (e.g., the Absmax method).' Evaluation uses lm-harness library." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "A dedicated 'Limitations' paragraph appears at the end of Section 5 (Conclusion), discussing specific gaps in the current framework." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": false, 179 "justification": "The limitations section lists method extensions (weight permutation, dynamic scaling, per-layer adaptive hyperparameters) rather than threats to the validity of the experimental conclusions. No discussion of whether results could be sensitive to calibration data choice, random seed, or other experimental factors." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": false, 184 "justification": "The limitations describe what the framework 'does not yet incorporate' (future work) rather than explicitly stating what the results do NOT show or what claims the authors are NOT making." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": false, 191 "justification": "No raw experimental data, logs, or intermediate results are made available for independent verification." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Data sources are clearly described: publicly available pre-trained models, C4 calibration samples, and standard evaluation benchmarks with references provided for each." 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants. Data sources are standard public benchmarks and pre-trained models." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "The pipeline is documented: calibration data selection (128 C4 samples, 2048 tokens), quantization procedure (Absmax for scales/zeros, layer-wise JTA optimization), and evaluation protocol (perplexity computation, lm-harness for zero-shot/reasoning tasks)." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "No funding sources or acknowledgments section is present in the paper." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are clearly listed: McGill University and Université de Montréal. These are academic institutions not affiliated with the evaluated models." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": false, 223 "justification": "Funding is not disclosed, so independence cannot be assessed. Absence of disclosure is not evidence of independence." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests statement or financial disclosure is present in the paper." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": true, 234 "answer": false, 235 "justification": "No training data cutoff dates are stated for any of the evaluated models (LLaMA-2/3, Qwen3, Mistral). These models were likely trained on data that includes many of the evaluation benchmarks." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": true, 239 "answer": false, 240 "justification": "No discussion of whether the pre-trained models may have seen C4, WikiText-2, ARC, HellaSwag, or other evaluation benchmarks during training." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": true, 244 "answer": false, 245 "justification": "WikiText-2, ARC, HellaSwag, PIQA, WinoGrande, and other benchmarks predate all evaluated models' training periods. No contamination risk is discussed." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in this study." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": false, 289 "justification": "Figure 4 shows relative per-layer quantization time increase ratios for different K values, but no absolute wall-clock times, GPU hours, or inference latency numbers are reported." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "No total computational budget, GPU hours, hardware specifications, or total runtime for the experiments is stated anywhere in the paper." 295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": false, 301 "justification": "All results appear to be single-run. No seed sensitivity analysis is reported, despite the method using randomized rounding (Klein's algorithm) which should exhibit seed dependence." 302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": false, 306 "justification": "The number of experimental runs per result is never stated. Results are presented as single values without indicating whether they were averaged over multiple runs." 307 }, 308 "hyperparameter_search_budget": { 309 "applies": true, 310 "answer": false, 311 "justification": "Table 4 shows a grid search over µ and λ (10×8 = 80 configurations) and Figure 2 shows K ablation, but the total compute spent on hyperparameter search is not stated." 312 }, 313 "best_config_selection_justified": { 314 "applies": true, 315 "answer": true, 316 "justification": "The paper justifies selecting (µ=0.6, λ=0.6) over the global minimum (µ=0.6, λ=0.4): 'the configuration (µ=0.6, λ=0.6) achieves comparable performance (PPL 7.87) and lies within a stable low-perplexity region.' The full grid is shown in Table 4." 317 }, 318 "multiple_comparison_correction": { 319 "applies": false, 320 "answer": false, 321 "justification": "No statistical significance tests are performed, so there are no p-values to correct for multiple comparisons." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": true, 325 "answer": false, 326 "justification": "The authors implement their own method and compare against baselines using 'default configurations of each baseline's repo,' but do not acknowledge the bias of evaluating their own system or discuss whether their baseline implementations are competitive." 327 }, 328 "compute_budget_vs_performance": { 329 "applies": true, 330 "answer": true, 331 "justification": "Figure 2 shows perplexity vs K (number of candidates), and Figure 4 shows per-layer time increase ratio vs K. Together these show the performance-compute tradeoff for the key hyperparameter." 332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": false, 336 "justification": "No discussion of whether perplexity on C4/WikiText-2 or zero-shot accuracy on standard benchmarks actually measure the quality of quantization that matters for real-world deployment." 337 }, 338 "scaffold_confound_addressed": { 339 "applies": false, 340 "answer": false, 341 "justification": "No scaffolding is involved. This is a weight quantization method evaluated on standard benchmarks." 342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the pre-trained models' training data includes information from the benchmark datasets, many of which predate the models." 349 }, 350 "feature_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether the evaluation setup leaks information. The paper does not address whether calibration data selection could influence results on related evaluation sets." 354 }, 355 "non_independence_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of independence between calibration data (C4 samples) and evaluation data (C4 test set). Both are drawn from the same corpus." 359 }, 360 "leakage_detection_method": { 361 "applies": true, 362 "answer": false, 363 "justification": "No leakage detection or prevention method is used or discussed." 364 } 365 } 366 }, 367 "claims": [ 368 { 369 "claim": "OJBKQ achieves lower perplexity at 3-4 bits compared to existing PTQ approaches (GPTQ, AWQ, QUIP) across multiple LLM families.", 370 "evidence": "Table 1 shows OJBKQ achieves the lowest perplexity in most settings across LLaMA-2/3, Qwen3, and Mistral models. For example, on LLaMA-3-8B at 4-bit g128: GPTQ 9.41/6.54, AWQ 9.40/6.54, Ours 9.33/6.48 (C4/WikiText-2).", 371 "supported": "strong" 372 }, 373 { 374 "claim": "The computational overhead of Random-K decoding is modest, with approximately 80% increase in per-layer runtime at K=25.", 375 "evidence": "Figure 4 shows relative time increase ratios per layer for K=1,5,10,25. At K=5 (default setting), overhead is approximately 15-20%.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Random-K exploration improves over deterministic Babai rounding by finding superior integer candidates.", 380 "evidence": "Tables 1-3 consistently show Ours(R) improving over Ours(N). Figure 1 shows JTA reconstruction error decreasing with higher K across layers and modules.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "The JTA objective provides a better candidate selection criterion than using either runtime-quantized or full-precision references alone.", 385 "evidence": "Tables 1-3 show Ours (with JTA) consistently outperforming Ours(R) (without JTA). Figure 3 shows a U-shaped curve for µ, confirming neither extreme (µ=0 or µ=1) is optimal. The ablation in Table 4 further demonstrates this across a grid of µ and λ values.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "The advantage of OJBKQ becomes more pronounced at 3-bit quantization compared to 4-bit.", 390 "evidence": "Table 1 shows larger performance gaps at 3-bit: on LLaMA-3-8B without grouping, GPTQ degrades to 29.42/23.81 while Ours achieves 11.38/8.04 on C4/WikiText-2. At 4-bit the gaps are much smaller.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "OJBKQ preserves reasoning capabilities under 4-bit quantization, closely tracking BF16 accuracy.", 395 "evidence": "Table 3 shows Ours achieving 84.29% on GSM8K for Qwen3-4B (vs 83.70% BF16) and 48.22% on GSM8K for LLaMA3-8B (vs 51.86% BF16). Average reasoning accuracy is competitive with or better than all baselines.", 396 "supported": "moderate" 397 } 398 ], 399 "red_flags": [ 400 { 401 "flag": "No error bars despite randomized method", 402 "detail": "The method uses Klein's randomized algorithm with K=5 trials, inherently producing variable results. Yet all experiments report single-run point estimates with no standard deviation, variance, or confidence intervals. The reader cannot assess whether observed improvements are within noise." 403 }, 404 { 405 "flag": "Selective presentation of results", 406 "detail": "Ours(N) performs worse than GPTQ on Qwen3-0.6B at 3-bit (47.23 vs 43.60 on C4) and Ours(N) gives 25.46 vs GPTQ's 18.16 on Q3-4B at 3-bit g128 W3A16, but these regressions are not discussed. The paper only highlights cases where the method wins." 407 }, 408 { 409 "flag": "No code release", 410 "detail": "The method involves non-trivial GPU kernel design (PPI-KBabai) and specific implementation choices, but no code is released, making independent reproduction difficult." 411 }, 412 { 413 "flag": "Calibration-evaluation overlap", 414 "detail": "C4 is used for both calibration (128 samples) and perplexity evaluation. While the calibration set is small relative to the test set, this overlap is not discussed or analyzed for potential bias." 415 } 416 ], 417 "cited_papers": [ 418 { 419 "title": "GPT-4 technical report", 420 "authors": ["Josh Achiam"], 421 "year": 2023, 422 "arxiv_id": "2303.08774", 423 "relevance": "Foundational LLM whose quantization is a key deployment concern; one of the models motivating compression research." 424 }, 425 { 426 "title": "GPTQ: Accurate post-training quantization for generative pre-trained transformers", 427 "authors": ["Elias Frantar", "Saleh Ashkboos", "Torsten Hoefler", "Dan Alistarh"], 428 "year": 2023, 429 "relevance": "Primary PTQ baseline; established the sequential rounding approach for LLM quantization that this paper improves upon." 430 }, 431 { 432 "title": "AWQ: Activation-aware weight quantization for LLM compression and acceleration", 433 "authors": ["Ji Lin", "Jiaming Tang", "Haotian Tang"], 434 "year": 2024, 435 "relevance": "Key PTQ baseline using salience-based scaling for LLM compression." 436 }, 437 { 438 "title": "QUIP: 2-bit quantization of large language models with guarantees", 439 "authors": ["Jerry Chee", "Yaohui Cai", "Volodymyr Kuleshov", "Christopher De Sa"], 440 "year": 2024, 441 "relevance": "Rotation-based PTQ method providing theoretical quantization guarantees for LLMs." 442 }, 443 { 444 "title": "FrugalGPT: How to use large language models while reducing cost and improving performance", 445 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 446 "year": 2023, 447 "arxiv_id": "2305.05176", 448 "relevance": "Addresses LLM deployment cost reduction, the same practical concern motivating quantization research." 449 }, 450 { 451 "title": "The geometry of LLM quantization: GPTQ as Babai's nearest plane algorithm", 452 "authors": ["Jiale Chen", "Yalda Shabanzadeh", "Elvir Crnčević", "Torsten Hoefler", "Dan Alistarh"], 453 "year": 2025, 454 "relevance": "Establishes the theoretical connection between PTQ and lattice decoding that this paper builds upon." 455 }, 456 { 457 "title": "Quantization error propagation: Revisiting layer-wise post-training quantization", 458 "authors": ["Yamato Arai", "Yuma Ichikawa"], 459 "year": 2025, 460 "relevance": "Directly motivates the JTA objective by analyzing how layer-wise quantization errors propagate." 461 }, 462 { 463 "title": "Benchmarking post-training quantization in LLMs: Comprehensive taxonomy, unified evaluation, and comparative analysis", 464 "authors": ["Jiaqi Zhao", "Ming Wang", "Miao Zhang"], 465 "year": 2025, 466 "relevance": "Comprehensive PTQ survey that provides the taxonomy used to classify quantization methods in this paper." 467 }, 468 { 469 "title": "LLaMA: Open and efficient foundation language models", 470 "authors": ["Hugo Touvron"], 471 "year": 2023, 472 "arxiv_id": "2302.13971", 473 "relevance": "Key open-weight LLM family used as primary evaluation target for quantization methods." 474 }, 475 { 476 "title": "Mistral 7B", 477 "authors": ["Albert Q. Jiang"], 478 "year": 2023, 479 "relevance": "Open-weight LLM used as evaluation target, representing diverse model architectures for quantization." 480 } 481 ] 482 }