scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24065B)
      1 {
      2   "paper": {
      3     "title": "LPCD: Unified Framework from Layer-Wise to Submodule Quantization",
      4     "authors": ["Yuma Ichikawa", "Yudai Fujimoto", "Akira Sakai"],
      5     "year": 2025,
      6     "arxiv_id": "2512.01546"
      7   },
      8   "checklist": {
      9     "artifacts": {
     10       "code_released": {
     11         "applies": true,
     12         "answer": false,
     13         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper."
     14       },
     15       "data_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The paper uses publicly available datasets (WikiText-2, C4) and publicly available models (LLaMA2, LLaMA3, Qwen3) from Hugging Face. No proprietary data was collected."
     19       },
     20       "environment_specified": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Section 5.1 states: 'We implement QEP, LoaQ, and LPCD using Python 3.12.11 with PyTorch 2.4.0 and Hugging Face Transformers 4.55.3. All experiments were conducted on an NVIDIA H100 GPU using the TSUBAME 4.0 supercomputer.' This provides specific library versions and hardware."
     24       },
     25       "reproduction_instructions": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided. The experimental setup is described in Section 5.1, but not at the level of executable commands."
     29       }
     30     },
     31     "statistical_methodology": {
     32       "confidence_intervals_or_error_bars": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Tables 1 and 2 report only point estimates for perplexity and zero-shot accuracy. No confidence intervals or error bars are provided."
     36       },
     37       "significance_tests": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper claims LPCD 'consistently outperforms' QEP and LoaQ, but no statistical significance tests are performed. Comparisons are based solely on comparing numbers in the tables."
     41       },
     42       "effect_sizes_reported": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper reports raw perplexity and accuracy numbers (Tables 1 and 2), but does not report effect sizes, relative improvements with baseline context, or any standardized measure of improvement magnitude in a systematic way."
     46       },
     47       "sample_size_justified": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No justification is given for the choice of calibration dataset size (256 sequences of 2048 tokens from WikiText-2), or the number of models tested. The paper notes they observed overfitting with 128 samples but does not justify the chosen alternative."
     51       },
     52       "variance_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No standard deviations, variance, or spread measures are reported for any experimental results. Results appear to be single-run numbers without any indication of variability across runs or seeds."
     56       }
     57     },
     58     "evaluation_design": {
     59       "baselines_included": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "The paper compares against QEP and LoaQ baselines using both RTN and GPTQ as underlying layer-wise quantizers, and includes FP16 full-precision reference numbers (Tables 1 and 2)."
     63       },
     64       "baselines_contemporary": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "QEP (Arai and Ichikawa, 2025), LoaQ (Lin and Wan, 2025), GPTAQ (Li et al., 2025), and Qronos (Zhang et al., 2025) are all from 2025. The baselines used (QEP and LoaQ) are contemporary."
     68       },
     69       "ablation_study": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "The paper applies LPCD to three submodules (QK, VO, Up-Down) but does not provide an ablation study showing the individual contribution of each submodule. There is no experiment removing one component to measure its effect."
     73       },
     74       "multiple_metrics": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper reports both perplexity on WikiText-2 (Table 1) and zero-shot average accuracy on ARC-E and PIQA (Table 2)."
     78       },
     79       "human_evaluation": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "This is a model quantization paper evaluating mathematical optimization techniques. Human evaluation of model outputs is not relevant to the claims being made."
     83       },
     84       "held_out_test_set": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Calibration samples are drawn from WikiText-2 for computing Hessians and quantization targets, while evaluation uses the standard WikiText-2 test split for perplexity and separate ARC-E and PIQA benchmarks for zero-shot accuracy. These are standard held-out evaluation protocols."
     88       },
     89       "per_category_breakdown": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Results are broken down by model (LLaMA2-7B, LLaMA2-13B, LLaMA3-8B, Qwen3-8B, Qwen3-14B), by bit-width (4-bit, 3-bit, 2-bit), and by base quantizer (RTN, GPTQ) in Tables 1 and 2."
     93       },
     94       "failure_cases_discussed": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The paper shows that at 2-bit precision, many configurations produce divergent perplexity (>1e3) and near-random accuracy, including some LPCD configurations. These failure modes are visible in the tables rather than hidden."
     98       },
     99       "negative_results_reported": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The results show cases where LPCD does not improve over baselines: e.g., at 4-bit GPTQ on LLaMA3-8B, LPCD gets 6.3818 PPL vs LoaQ's 6.2109 (Table 1). At 2-bit GPTQ on LLaMA2-7B, LPCD gets 341.3434 PPL vs QEP's 101.1521. These non-improvements are reported transparently."
    103       }
    104     },
    105     "claims_and_evidence": {
    106       "abstract_claims_supported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The abstract claims LPCD 'generalizes existing methods' (supported by Proposition 4.1 and Remark 4.2 showing QEP and LoaQ as special cases) and 'consistently enhances both layer-wise PTQ methods and existing submodule approaches' (largely supported by Tables 1 and 2, though with some exceptions at 2-bit). The word 'consistently' is slightly generous given 2-bit results."
    110       },
    111       "causal_claims_justified": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper's causal claims are primarily about the mathematical framework: LPCD optimizes submodule-level objectives and projects back to layer-wise quantizers. The ablation-like structure (comparing QEP alone, LoaQ alone, and LPCD on top) constitutes controlled single-variable manipulation of the quantization method, which supports causal inference about LPCD's contribution."
    115       },
    116       "generalization_bounded": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The title 'Unified Framework from Layer-Wise to Submodule Quantization' and abstract claim of a 'unified framework' are broad, while experiments are limited to per-channel weight-only quantization on two LLM families (LLaMA and Qwen) evaluated on perplexity and two zero-shot tasks. The paper does not test on more diverse architectures, tasks, or quantization schemes despite the framework's claimed generality."
    120       },
    121       "alternative_explanations_discussed": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper does not discuss alternative explanations for the observed improvements. For instance, it does not examine whether gains are due to the specific submodule decomposition, the additional gradient-based optimization epochs, or simply the larger calibration dataset used compared to LoaQ (256 vs 128 samples). No threats-to-validity or robustness checks are provided."
    125       }
    126     },
    127     "setup_transparency": {
    128       "model_versions_specified": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper specifies model families (LLaMA2-7B, LLaMA2-13B, LLaMA3-8B, Qwen3-4B, Qwen3-8B, Qwen3-14B) but does not provide exact Hugging Face model IDs, snapshot hashes, or version identifiers. Marketing names like 'LLaMA2-7B' without a specific checkpoint identifier are insufficient."
    132       },
    133       "prompts_provided": {
    134         "applies": false,
    135         "answer": false,
    136         "justification": "This paper does not use prompting. It performs post-training quantization, which involves mathematical optimization of weight matrices rather than LLM prompting."
    137       },
    138       "hyperparameters_reported": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Section 5.1 reports: grid search ranges for alpha (0 to 1, increment 0.1) and beta (0 to 1, increment 0.05), gradient-based optimization with batch size 8, 40 epochs, cosine-scheduled learning rate starting at 1e-5, Adam optimizer with default PyTorch settings. Calibration dataset size (256 sequences of 2048 tokens) is also reported."
    142       },
    143       "scaffolding_described": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "No agentic scaffolding is used. This is a quantization algorithm paper."
    147       },
    148       "data_preprocessing_documented": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 5.1 documents the calibration data: '2048 tokens consisting of 256 sequences, randomly sampled from the WikiText-2 dataset.' The paper also notes why they deviated from LoaQ's setup (128 samples from C4) due to observed overfitting."
    152       }
    153     },
    154     "limitations_and_scope": {
    155       "limitations_section_present": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The conclusion mentions future work directions but does not substantively discuss limitations of the current approach."
    159       },
    160       "threats_to_validity_specific": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "No specific threats to validity are discussed. The paper does not address potential confounds such as the effect of different calibration data, sensitivity to hyperparameters, or computational overhead."
    164       },
    165       "scope_boundaries_stated": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The paper does not explicitly state what the results do not show. Section 5.1 mentions 'we focus only on the per-channel weight quantization scheme' but does not frame this as a scope limitation. The conclusion mentions future work directions (nonlinear submodules, joint weight/activation/KV quantization) without explicitly stating these as current limitations."
    169       }
    170     },
    171     "data_integrity": {
    172       "raw_data_available": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No raw experimental data (quantized weights, intermediate results, logs) is available for independent verification. Only aggregated numbers in tables are provided."
    176       },
    177       "data_collection_described": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "The data collection procedure is described: calibration data is 256 sequences of 2048 tokens randomly sampled from WikiText-2. Evaluation uses standard WikiText-2 test split for perplexity and ARC-E/PIQA benchmarks for zero-shot accuracy. Models are obtained from standard open-weight releases."
    181       },
    182       "recruitment_methods_described": {
    183         "applies": false,
    184         "answer": false,
    185         "justification": "No human participants are involved. All data sources are standard benchmarks and publicly available LLMs."
    186       },
    187       "data_pipeline_documented": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The pipeline is documented: (1) obtain calibration data from WikiText-2, (2) apply LoaQ to each submodule, (3) perform LPCD optimization with gradient-based method, (4) apply layer-wise PTQ projection. Section 5.1 and Section 4.4 describe the procedure."
    191       }
    192     },
    193     "conflicts_of_interest": {
    194       "funding_disclosed": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The Acknowledgements section discloses funding: 'This work was partially supported by JST BOOST, Japan (Grant No. JPMJBY24D0), and by the Cabinet Office, Government of Japan, through the SIP program.'"
    198       },
    199       "affiliations_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Author affiliations are listed: Yuma Ichikawa (Fujitsu Limited, RIKEN Center for AIP), Yudai Fujimoto (Fujitsu Limited, Institute of Science Tokyo), Akira Sakai (Fujitsu Limited, Tokai University). One author (Ichikawa) is also an author of QEP, one of the baselines."
    203       },
    204       "funder_independent_of_outcome": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The funders are JST BOOST (Japan Science and Technology Agency) and the Cabinet Office of Japan through SIP. These are government research agencies with no direct financial stake in quantization method outcomes."
    208       },
    209       "financial_interests_declared": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No competing interests statement is present in the paper. The authors work at Fujitsu Limited, which has commercial interest in LLM deployment efficiency, but no declaration of financial interests is made."
    213       }
    214     },
    215     "contamination": {
    216       "training_cutoff_stated": {
    217         "applies": false,
    218         "answer": false,
    219         "justification": "This paper performs post-training quantization of model weights. It does not evaluate pre-trained model capabilities on benchmarks in a way where training data contamination would be relevant. The perplexity and accuracy measurements assess quantization quality, not model knowledge."
    220       },
    221       "train_test_overlap_discussed": {
    222         "applies": false,
    223         "answer": false,
    224         "justification": "Not applicable for the same reason: the paper evaluates quantization methods, not model knowledge. The concern is whether quantization preserves model quality, not whether the model memorized the test data."
    225       },
    226       "benchmark_contamination_addressed": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "Not applicable: the paper measures quantization-induced degradation relative to full-precision baselines. Even if benchmark contamination existed, it would affect the FP16 baseline and quantized models equally."
    230       }
    231     },
    232     "human_studies": {
    233       "pre_registered": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "No human participants are involved in this study."
    237       },
    238       "irb_or_ethics_approval": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants are involved in this study."
    242       },
    243       "demographics_reported": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants are involved in this study."
    247       },
    248       "inclusion_exclusion_criteria": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants are involved in this study."
    252       },
    253       "randomization_described": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants are involved in this study."
    257       },
    258       "blinding_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants are involved in this study."
    262       },
    263       "attrition_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants are involved in this study."
    267       }
    268     },
    269     "cost_and_practicality": {
    270       "inference_cost_reported": {
    271         "applies": true,
    272         "answer": false,
    273         "justification": "No inference cost, latency, or wall-clock time is reported for the quantization process. The paper does not report how long LPCD takes compared to QEP or LoaQ, which is relevant given the additional gradient-based optimization (40 epochs per submodule)."
    274       },
    275       "compute_budget_stated": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "The paper states experiments were conducted on NVIDIA H100 GPU using the TSUBAME 4.0 supercomputer, but does not quantify total GPU hours, wall-clock time, or computational cost of the LPCD optimization process."
    279       }
    280     }
    281   },
    282   "claims": [
    283     {
    284       "claim": "LPCD generalizes QEP and LoaQ as special cases within a unified framework.",
    285       "evidence": "Proposition 4.1 proves QEP is equivalent to a single LPCD update. Remark 4.2 shows LoaQ is also a single LPCD update for a suitably extended objective (Section 4.3).",
    286       "supported": "strong"
    287     },
    288     {
    289       "claim": "LPCD-based submodule quantization consistently enhances both layer-wise PTQ methods and existing submodule approaches.",
    290       "evidence": "Tables 1 and 2 show LPCD achieves the lowest perplexity and highest accuracy in most configurations across 5 models, 3 bit-widths, and 2 base quantizers. However, at 2-bit GPTQ, LPCD underperforms QEP on LLaMA2-7B (341.3434 vs 101.1521 PPL) and in several other 2-bit configurations.",
    291       "supported": "moderate"
    292     },
    293     {
    294       "claim": "RTN combined with LPCD surpasses the more sophisticated QEP+GPTQ configuration for Qwen3-8B.",
    295       "evidence": "Table 1 at 3-bit: RTN+LPCD achieves 12.7110 PPL vs QEP+GPTQ 15.0779 PPL. Table 2 at 3-bit: RTN+LPCD achieves 0.6291 accuracy vs QEP+GPTQ 0.5986 (Section 5.2).",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "LPCD consistently yields lower quantization error than QEP and LoaQ across Transformer blocks.",
    300       "evidence": "Figure 1 shows output MSE across Transformer blocks in LLaMA3-8B at 4-, 3-, and 2-bit weight quantization. LPCD consistently shows lower MSE across all layers in all three bit-width settings.",
    301       "supported": "moderate"
    302     }
    303   ],
    304   "methodology_tags": ["benchmark-eval", "theoretical"],
    305   "key_findings": "LPCD is a unified post-training quantization framework that extends layer-wise PTQ to arbitrary Transformer submodules (QK, VO, MLP Up-Down) through a relaxation-projection approach. The framework mathematically generalizes QEP and LoaQ as special cases corresponding to single LPCD iterations. Experimental results on LLaMA2/3 and Qwen3 models show LPCD improves perplexity and zero-shot accuracy over QEP and LoaQ in most configurations, with the largest gains at 3-bit quantization, though 2-bit results remain unstable across methods.",
    306   "red_flags": [
    307     {
    308       "flag": "Self-comparison with own prior work",
    309       "detail": "Author Yuma Ichikawa is also an author of QEP (Arai and Ichikawa, 2025), one of the two primary baselines. While the comparison is fair on its face, the paper's framing as a generalization of the authors' own prior method warrants noting."
    310     },
    311     {
    312       "flag": "No variance or uncertainty quantification",
    313       "detail": "All results are reported as single-run point estimates without error bars, standard deviations, or confidence intervals. Given that quantization involves random sampling of calibration data and gradient-based optimization, the variability of results across runs is unknown."
    314     },
    315     {
    316       "flag": "No ablation study for submodule contributions",
    317       "detail": "LPCD is applied to three submodules (QK, VO, Up-Down) simultaneously, but no ablation isolates the contribution of each submodule. It is unclear which submodule drives the improvements."
    318     },
    319     {
    320       "flag": "No computational cost comparison",
    321       "detail": "LPCD adds gradient-based optimization (40 epochs with Adam) on top of existing methods. The additional computational cost is not quantified, making it impossible to assess the cost-quality tradeoff."
    322     },
    323     {
    324       "flag": "Inconsistent 2-bit results",
    325       "detail": "At 2-bit precision, LPCD sometimes dramatically underperforms QEP (e.g., 341.3 vs 101.2 PPL on LLaMA2-7B GPTQ) despite claims of consistent enhancement. The paper does not explain these failures."
    326     }
    327   ],
    328   "cited_papers": [
    329     {
    330       "title": "GPTQ: Accurate Post-Training Quantization for Generative Pre-Trained Transformers",
    331       "authors": ["Elias Frantar", "Saleh Ashkboos", "Torsten Hoefler", "Dan Alistarh"],
    332       "year": 2022,
    333       "arxiv_id": "2210.17323",
    334       "relevance": "Foundational layer-wise PTQ method (GPTQ) used as a base quantizer in LPCD experiments."
    335     },
    336     {
    337       "title": "AWQ: Activation-Aware Weight Quantization for On-Device LLM Compression and Acceleration",
    338       "authors": ["Ji Lin", "Jiaming Tang", "Haotian Tang"],
    339       "year": 2024,
    340       "relevance": "Key layer-wise PTQ method based on salience-aware weight scaling, relevant to LLM quantization methodology."
    341     },
    342     {
    343       "title": "QuIP: 2-bit Quantization of Large Language Models With Guarantees",
    344       "authors": ["Jerry Chee", "Yaohui Cai", "Volodymyr Kuleshov", "Christopher M De Sa"],
    345       "year": 2023,
    346       "relevance": "Rotation-based quantization method with theoretical guarantees for extreme low-bit LLM quantization."
    347     },
    348     {
    349       "title": "Quantization Error Propagation: Revisiting Layer-Wise Post-Training Quantization",
    350       "authors": ["Yamato Arai", "Yuma Ichikawa"],
    351       "year": 2025,
    352       "relevance": "QEP is a direct predecessor and primary baseline, shown to be a special case of LPCD."
    353     },
    354     {
    355       "title": "LoaQ: Layer-Wise Output Approximation Quantization",
    356       "authors": ["Li Lin", "Xiaojun Wan"],
    357       "year": 2025,
    358       "arxiv_id": "2509.06297",
    359       "relevance": "LoaQ extends layer-wise PTQ to residual connections and RMSNorm; second primary baseline shown as LPCD special case."
    360     },
    361     {
    362       "title": "PV-Tuning: Beyond Straight-Through Estimation for Extreme LLM Compression",
    363       "authors": ["Vladimir Malinovskii", "Denis Mazur", "Ivan Ilin"],
    364       "year": 2024,
    365       "relevance": "Block-wise and global PTQ method using alternating optimization, relevant to understanding LPCD's position in the quantization landscape."
    366     },
    367     {
    368       "title": "The Llama 3 Herd of Models",
    369       "authors": ["Aaron Grattafiori"],
    370       "year": 2024,
    371       "arxiv_id": "2407.21783",
    372       "relevance": "Primary evaluation model family (LLaMA3-8B) used to benchmark quantization methods."
    373     },
    374     {
    375       "title": "GPTAQ: Efficient Finetuning-Free Quantization for Asymmetric Calibration",
    376       "authors": ["Yuhang Li", "Ruokai Yin", "Donghyun Lee"],
    377       "year": 2025,
    378       "relevance": "Contemporary layer-wise PTQ method that refines quantization via calibration asymmetry correction."
    379     },
    380     {
    381       "title": "Qronos: Correcting the Past by Shaping the Future... in Post-Training Quantization",
    382       "authors": ["Shihao Zhang", "Haoyu Zhang", "Ian Colbert", "Rayan Saab"],
    383       "year": 2025,
    384       "arxiv_id": "2505.11695",
    385       "relevance": "Contemporary error compensation method for layer-wise PTQ, relevant to understanding the competitive landscape."
    386     },
    387     {
    388       "title": "Benchmarking Post-Training Quantization in LLMs: Comprehensive Taxonomy, Unified Evaluation, and Comparative Analysis",
    389       "authors": ["Jiaqi Zhao", "Ming Wang", "Miao Zhang"],
    390       "year": 2025,
    391       "arxiv_id": "2502.13178",
    392       "relevance": "Comprehensive benchmarking study of PTQ methods for LLMs, relevant to methodology quality assessment in quantization research."
    393     },
    394     {
    395       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    396       "authors": ["Edward J Hu", "Yelong Shen", "Phillip Wallis"],
    397       "year": 2022,
    398       "relevance": "LPCD extends to LoRA-based error compensation in Appendix B.4, connecting quantization to parameter-efficient fine-tuning."
    399     }
    400   ]
    401 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs