scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25466B)
      1 {
      2   "paper": {
      3     "title": "The case for 4-bit precision: k-bit Inference Scaling Laws",
      4     "authors": ["Tim Dettmers", "Luke Zettlemoyer"],
      5     "year": 2022,
      6     "venue": "arXiv",
      7     "arxiv_id": "2212.09720"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL, code archive, or link to source code is provided in the paper. The paper describes experiments and methods but does not release any implementation."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses publicly available models (OPT, BLOOM, Pythia, GPT-2) and publicly available evaluation benchmarks (EleutherAI LM Evaluation Harness, The Pile Common Crawl, LAMBADA, Winogrande, HellaSwag, PiQA). All data sources are public."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is provided. The paper does not list library versions or dependencies needed to reproduce the experiments."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README with commands, or reproduction scripts are provided. The experimental setup section (Section 4) describes the general approach but not enough to replicate without guessing implementation details."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper reports point estimates of zero-shot accuracy and perplexity. No confidence intervals or error bars are reported on the main results figures or tables."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper makes claims such as '4-bit precision is almost universally optimal' and compares scaling curves across precisions, but no statistical significance tests (p-values, t-tests, etc.) are used to support these comparative claims."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper provides effect sizes in context throughout. For example, Section 5.2 states 'Going from a block size of 1024 to 64 adds 0.24 bits per parameter but improves zero-shot accuracy almost as much as going from 4-bit to 5-bit precision.' Figures show absolute accuracy values across conditions, providing baseline context for differences."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper states 'more than 35,000 experiments' but does not justify why this number was chosen or provide any power analysis. The number of zero-shot evaluation examples per task is not discussed or justified."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations, variance across seeds, or spread measures are reported. The paper notes that zero-shot accuracy is 'noisier' than perplexity (Section 4) but does not quantify this noise through variance reporting. Single-run results appear to be reported."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper uses 16-bit (unquantized) models as the baseline and compares across multiple bit precisions (3 to 8-bit). It also compares against GPTQ (one-shot quantization method) in Section 7 and Table 1."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The baselines include GPTQ (Frantar et al., 2022), SmoothQuant (Xiao et al., 2022), ZeroQuant (Yao et al., 2022), and nuQmm (Park et al., 2022), all published in 2022 — contemporary with this work."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper systematically ablates quantization components: data types (Integer, Float, Quantile, Dynamic Exponent), block sizes (64, 128, 256, 512, 1024), and outlier-dependent quantization (proxy quantization). Sections 5.2 and Appendix C provide detailed ablation results."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper uses both mean zero-shot accuracy (across LAMBADA, PiQA, Winogrande, HellaSwag) and perplexity on The Pile Common Crawl as evaluation metrics. Section 4 discusses the correlation between the two (-0.94 Pearson)."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is a quantization benchmark study measuring model accuracy under different precision settings. Human evaluation is irrelevant to the claims about optimal bit precision for LLM inference."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper evaluates on standard zero-shot benchmarks (LAMBADA, PiQA, Winogrande, HellaSwag) and The Pile Common Crawl perplexity. These are established held-out test sets not used for any tuning decisions in this work (zero-shot quantization requires no data)."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper provides breakdowns per model family (OPT, BLOOM, BLOOMZ, Pythia, GPT-2) in Figure 2, per data type in Figure 3/9, per block size in Figure 3/8, and per individual benchmark (e.g., LAMBADA in Figure 5). Appendix figures provide extensive per-model breakdowns."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 5.1 discusses 3-bit instabilities in OPT and Pythia models. Section 5.2 discusses where proxy quantization fails to improve scaling despite stabilizing 3-bit models. Appendix B reports negative results on distribution centering."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 5.2 reports that no quantization methods improve 6-to-8-bit scaling. Appendix B reports that distribution centering is ineffective. Section 5.2 reports that proxy quantization does not improve bit-level scaling despite improving stability. These are clearly negative findings."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims 4-bit is 'almost universally optimal,' which is supported by Figure 2 showing 4-bit dominance across OPT, BLOOM, GPT-2, and Pythia with the noted exception of BLOOM-176B at 3-bit. The abstract's claim about block size and data type improvements is supported by Figures 3, 8, and 9. The 'almost' qualifier is honest."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes causal claims through ablation design — e.g., 'data types improve scaling' and 'small block size improves scaling' (Section 5.2). These are based on controlled single-variable manipulation: changing one quantization parameter while holding others fixed. The ablation methodology is adequate for these causal inferences."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 8 (Discussion & Limitations) explicitly bounds generalizations: scaling laws are 'only valid for cases where the mini-batch does not fit into the L1 cache,' certain quantization method classes were not tested, and optimized GPU implementations are lacking. The title and claims specify 'zero-shot' quantization and 'inference' rather than making overly broad claims."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 8 discusses alternative explanations: the 3-bit instability is related to emergent outlier features (analyzed in Section 5.2); the possibility that optimized data types could change the 4-bit optimality; and the distinction between zero-shot and one-shot quantization methods possibly yielding different optimal precisions."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper specifies exact model families and sizes: OPT (125M to 175B), BLOOM and BLOOMZ (560M to 176B), Pythia/NeoX (19M to 20B), and GPT-2 (117M to 1.5B). These are open-source models with well-defined parameter counts serving as their version identifiers, cited with their original papers (Zhang et al. 2022, Scao et al. 2022, Black et al. 2022, Radford et al. 2019)."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "This paper does not use prompting. It evaluates zero-shot performance on standard benchmarks using the EleutherAI LM Evaluation Harness, which has standardized evaluation protocols. No custom prompts are designed."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper specifies key quantization hyperparameters: block sizes (64, 128, 256, 512, 1024), bit precisions (3-8, 16), data types (Integer, Float, Quantile, Dynamic Exponent), exponent bit configurations (Appendix C.4), and proxy quantization threshold (top 2% of outlier dimensions). Evaluation is done using the standardized EleutherAI harness in 'GPT-2 setting' (Section 4)."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. This is a quantization study evaluating model performance under different precision settings."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4 describes the evaluation setup including the specific tasks used, the evaluation harness configuration ('GPT-2 setting'), and the quantization procedure. Appendix A provides full specification of all data types used. The quantization formulas are given explicitly (Equations 1-8)."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 8 is titled 'Discussion & Limitations' and provides substantive discussion of multiple limitations across several paragraphs."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 8 discusses specific threats: (1) certain quantization method classes were not tested (learned data types, optimization-based methods); (2) lack of optimized GPU implementations means practical speedups are uncertain; (3) scaling laws are only valid for small batch sizes where mini-batch fits in L1 cache; (4) attention operations are not addressed for inference latency. These are all specific to this study."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 8 explicitly states: 'our scaling laws are only valid for cases where the mini-batch does not fit into the L1 cache of the device, and beyond this, a new set of scaling laws is required.' It also states that certain quantization methods (e.g., data types optimized with input data) were not tested. Section 7 specifies the recommendation is for 'zero-shot quantized models for inference.'"
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The raw experimental results from the 35,000+ experiments are not released. Only aggregated plots and scaling curves are shown in the paper."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4 describes the experimental setup: 16-bit inputs with k-bit parameters, evaluation using the EleutherAI LM Evaluation Harness on four zero-shot tasks plus perplexity on The Pile Common Crawl. The model families, parameter scales, and bit precisions are all specified."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. The study uses standard public benchmarks and publicly available LLMs."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline is documented: models are loaded, quantized to k-bit precision using specified data types and block sizes (Section 4, Appendix A), then evaluated on standardized benchmarks using the EleutherAI harness. The quantization procedure is formalized mathematically (Equations 1-8)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper. There is no mention of grants, sponsors, or financial support."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Both authors are identified as affiliated with the University of Washington (footnote 1). This is an academic affiliation with no obvious product being evaluated."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of funding disclosure means this question cannot be answered affirmatively."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement or financial interest declaration is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper evaluates pre-trained models (OPT, BLOOM, Pythia, GPT-2) on benchmarks but does not state the training data cutoff dates for any of these models."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether the zero-shot evaluation benchmarks (LAMBADA, PiQA, Winogrande, HellaSwag) appeared in the training data of the evaluated models. This is relevant since these are well-known benchmarks and the models were trained on large internet corpora."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "All four zero-shot benchmarks (LAMBADA 2016, PiQA 2020, Winogrande 2021, HellaSwag 2019) were published before the training data collection for models like OPT and BLOOM. The paper does not discuss contamination risk despite this. However, since the paper's core contribution is comparing quantization precisions on the SAME models (relative rather than absolute performance), contamination would affect all conditions equally and is less critical to the claims."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "While the paper discusses inference latency conceptually (Section 2.1) and provides theoretical speedup calculations (e.g., 4.46x for 3-bit vs 16-bit), it does not report actual inference costs, wall-clock times, or API costs for the 35,000+ experiments."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Section 7 provides a partial compute estimate: 'repeating our scaling experiments for GPTQ only for the OPT-175B and BLOOM-176B models would consume an estimated 5,120 GPU days of compute.' Appendix C.5 mentions not completing certain evaluations due to compute constraints ('before the conference deadline'). While the total budget for all 35,000 experiments is not explicitly stated, the scale is indicated."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "4-bit precision is almost universally optimal for total model bits and zero-shot accuracy across all model scales and families tested.",
    286       "evidence": "Figure 2 shows bit-level scaling laws for OPT, BLOOM, Pythia, and GPT-2 from 19M to 176B parameters. 4-bit consistently provides the best accuracy per total model bit, with the only exception being BLOOM-176B where 3-bit is slightly better (Section 5.1).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "3-bit quantization causes performance degradation and instability, especially in OPT and Pythia models.",
    291       "evidence": "Section 5.1, observation 3: 'Pythia and OPT are unstable for 3-bit inference where performance is close to random (35%) for the largest Pythia/OPT models.' Figure 2 and Figure 4 demonstrate the instability and degradation at 3-bit.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Small block sizes (64-128) and appropriate data types (float, quantile) are the most effective methods to improve 4-bit scaling.",
    296       "evidence": "Section 5.2 and Figures 3, 8, 9 show block size improvements: 'Going from a block size of 1024 to 64 adds 0.24 bits per parameter but improves zero-shot accuracy almost as much as going from 4-bit to 5-bit precision.' Quantile quantization shown as best data type across models in Figures 9 and 14.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "No quantization methods improve bit-level scaling for 6 to 8-bit precision.",
    301       "evidence": "Section 5.2 and Appendix C.3 (Figures 10, 11): 'We combine all possible combinations of quantization methods with 6 to 8-bit quantization, and we find that none of these methods improve bit-level scaling.'",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Proxy quantization removes 3-bit instabilities but does not improve bit-level scaling beyond 4-bit.",
    306       "evidence": "Section 5.2 and Figure 4: Proxy quantization stabilizes OPT and Pythia at 3-bit but '4-bit precision still provides better scaling.' For 4-bit, 'outlier-dependent quantization has no scaling benefit.'",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "Perplexity and zero-shot accuracy are highly correlated (Pearson r = -0.94) across 35,000+ experiments.",
    311       "evidence": "Section 4 states: 'across more than 35,000 zero-shot experiments, the Pearson correlation coefficient between The Pile Common Crawl perplexity and zero-shot performance is -0.94.'",
    312       "supported": "moderate"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval"],
    316   "key_findings": "4-bit quantization is almost universally optimal for the trade-off between total model bits and zero-shot accuracy across five LLM families (OPT, BLOOM, BLOOMZ, Pythia, GPT-2) spanning 19M to 176B parameters. Using a small block size (64-128) and float/quantile data types provides the most effective improvements to 4-bit scaling, while no quantization methods improve scaling at 6-8 bit precision. 3-bit quantization causes instabilities in OPT and Pythia due to emergent outlier features; proxy quantization stabilizes these models, but 3-bit precision still does not match 4-bit scaling efficiency.",
    317   "red_flags": [
    318     {
    319       "flag": "No uncertainty quantification on main results",
    320       "detail": "Despite running 35,000+ experiments, no confidence intervals, error bars, or variance measures are reported. The paper acknowledges zero-shot accuracy is 'noisier' than perplexity but does not quantify the noise. Scaling trends are shown as point estimates connected by lines."
    321     },
    322     {
    323       "flag": "No code release",
    324       "detail": "For a paper claiming to establish scaling laws based on 35,000+ experiments, the lack of released code, raw data, or experiment logs makes independent verification difficult."
    325     }
    326   ],
    327   "cited_papers": [
    328     {
    329       "title": "GPTQ: Accurate Post-Training Quantization for Generative Pre-Trained Transformers",
    330       "authors": ["Elias Frantar", "Saleh Ashkboos", "Torsten Hoefler", "Dan Alistarh"],
    331       "year": 2022,
    332       "arxiv_id": "2210.17323",
    333       "relevance": "Key one-shot quantization method compared against zero-shot methods in this paper; directly relevant to LLM inference efficiency."
    334     },
    335     {
    336       "title": "LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale",
    337       "authors": ["Tim Dettmers", "Mike Lewis", "Younes Belkada", "Luke Zettlemoyer"],
    338       "year": 2022,
    339       "relevance": "Prior work by same lead author on 8-bit quantization for LLMs; established outlier feature detection that this paper extends with proxy quantization."
    340     },
    341     {
    342       "title": "SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models",
    343       "authors": ["Guangxuan Xiao", "Ji Lin", "Mickael Seznec", "Julien Demouth", "Song Han"],
    344       "year": 2022,
    345       "arxiv_id": "2211.10438",
    346       "relevance": "Contemporary quantization method addressing outlier features in LLMs; relevant to understanding activation-weight quantization tradeoffs."
    347     },
    348     {
    349       "title": "ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers",
    350       "authors": ["Zhewei Yao", "Reza Yazdani Aminabadi", "Minjia Zhang", "Xiaoxia Wu", "Conglong Li", "Yuxiong He"],
    351       "year": 2022,
    352       "arxiv_id": "2206.01861",
    353       "relevance": "Zero-shot quantization method for LLMs; directly relevant baseline for comparing quantization approaches at different bit precisions."
    354     },
    355     {
    356       "title": "Scaling Laws for Neural Language Models",
    357       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan", "Tom B. Brown", "Benjamin Chess", "Rewon Child", "Scott Gray", "Alec Radford", "Jeffrey Wu", "Dario Amodei"],
    358       "year": 2020,
    359       "arxiv_id": "2001.08361",
    360       "relevance": "Foundational scaling laws paper that this work extends to the bit-precision dimension; establishes methodology for studying how variables change with scale."
    361     },
    362     {
    363       "title": "OPT: Open Pre-trained Transformer Language Models",
    364       "authors": ["Susan Zhang", "Stephen Roller", "Naman Goyal", "Mikel Artetxe", "Moya Chen"],
    365       "year": 2022,
    366       "arxiv_id": "2205.01068",
    367       "relevance": "One of the primary model families evaluated across all scales (125M to 175B); relevant as a major open-source LLM used in evaluation studies."
    368     },
    369     {
    370       "title": "BLOOM: A 176B-Parameter Open-Access Multilingual Language Model",
    371       "authors": ["Teven Le Scao", "Angela Fan", "Christopher Akiki"],
    372       "year": 2022,
    373       "arxiv_id": "2211.05100",
    374       "relevance": "Another primary model family evaluated; relevant as an open multilingual LLM for studying quantization behavior."
    375     },
    376     {
    377       "title": "GLM-130B: An Open Bilingual Pre-trained Model",
    378       "authors": ["Aohan Zeng", "Xiao Liu", "Zhengxiao Du", "Zihan Wang"],
    379       "year": 2022,
    380       "arxiv_id": "2210.02414",
    381       "relevance": "Related work on LLM quantization for inference that studied 4-bit vs 16-bit scaling trends; this paper extends that analysis to 3-8 bit range."
    382     },
    383     {
    384       "title": "Efficiently Scaling Transformer Inference",
    385       "authors": ["Reiner Pope", "Sholto Douglas", "Aakanksha Chowdhery", "Jacob Devlin"],
    386       "year": 2022,
    387       "arxiv_id": "2211.05102",
    388       "relevance": "Studies scaling inference in production settings with large batch sizes; complementary perspective to this paper's small-batch focus on bit-level efficiency."
    389     },
    390     {
    391       "title": "A Framework for Few-Shot Language Model Evaluation",
    392       "authors": ["Leo Gao", "Jonathan Tow", "Stella Biderman"],
    393       "year": 2021,
    394       "relevance": "The EleutherAI LM Evaluation Harness used as the primary evaluation framework in this paper; critical infrastructure for LLM benchmark evaluation."
    395     }
    396   ]
    397 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs