scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24839B)
      1 {
      2   "paper": {
      3     "title": "LUT-LLM: Efficient Large Language Model Inference with Memory-based Computations on FPGAs",
      4     "authors": [
      5       "Zifan He",
      6       "Shengyu Ye",
      7       "Rui Ma",
      8       "Yang Wang",
      9       "Jason Cong"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv preprint (submitted to ACM conference)",
     13     "arxiv_id": "2511.06174"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No repository URL or code archive is provided in the paper. The paper mentions using TAPA framework and Vivado tools but does not release its own source code."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper uses publicly available datasets: GLUE benchmark, SQuAD v2, MMLU-Pro, and the FineWeb dataset for training. The Qwen 3 model is also publicly available."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Section 5.1 specifies the FPGA (AMD Alveo V80), Xilinx Vitis HLS 2024.2, TAPA framework, RapidStream for floorplanning, Vivado 2024.2, and GPU benchmarking setup with vLLM. Hardware configurations are detailed in Table 2."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No step-by-step reproduction instructions are provided. The paper describes the design methodology and tools used but does not include commands or scripts to reproduce the results."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No confidence intervals or error bars are reported for any results. Latency and energy efficiency numbers are presented as point estimates without uncertainty bounds."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper makes comparative claims (e.g., '1.66x lower latency than MI210') without any statistical significance tests. Comparisons are based solely on point estimates."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Effect sizes are reported as speedup ratios with baseline context throughout (e.g., '1.66x lower geomean latency than MI210', '1.72x more energy efficient than A100', '5.6x and 1.9x faster than Allo and InTAR'). These ratios provide clear magnitude context."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No justification for the range of input/output configurations tested. The context window is limited to 512 tokens due to training constraints but no power analysis or sample size justification is given for the benchmark evaluation points."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No variance or standard deviation is reported across experimental runs. All latency and energy efficiency results appear to be single-run measurements without reporting any spread."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Multiple baselines are included: GPUs (AMD MI210, NVIDIA A100) with both BF16 and INT8 precision, and FPGA baselines (Allo, InTAR, FlightLLM). Quantization baselines (SmoothQuant, GPTQ, RTN INT8) are also compared for model quality."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Baselines include recent works: Allo (PLDI 2024), InTAR (FCCM 2025), FlightLLM (FPGA 2024), FlashAttention-2 (2023), FlashDecoding (2023), GPTQ Marlin kernel (2025). These represent the current state of the art."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper ablates the quantization scheme: Table 3 shows progressive quality impact of activation quantization, INT8 LUT quantization, and weight quantization. Section 3 compares weight-only, activation-only, and activation-weight co-quantization performance models. Section 4 compares pure dataflow vs. sequential vs. hybrid execution strategies."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Multiple metrics are used: end-to-end latency, decode latency, energy efficiency (tokens/Joule), and model quality metrics (accuracy and F1 across GLUE sub-tasks, SQuAD v2, MMLU-Pro). Resource utilization is also reported."
     85       },
     86       "human_evaluation": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "Human evaluation is not relevant for this hardware accelerator paper. The evaluation concerns latency, energy efficiency, and model quality on standard benchmarks."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Model quality is evaluated on standard benchmarks (GLUE, SQuAD v2, MMLU-Pro) which serve as held-out test sets. The model is trained on FineWeb and finetuned on WikiQA, then tested on these separate benchmarks."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Table 3 provides per-task breakdown across 7 GLUE sub-tasks and SQuAD v2. Figures 11 and 12 break down performance by input/output length combinations. Latency is broken down into prefill and decode stages."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper discusses where LUT-LLM underperforms: it is slower than A100 with GPTQ Marlin kernel (Section 5.3.2), the 512-token context window limitation due to training memory constraints is acknowledged (Section 5.1), and prefill performance narrows against InTAR as input length increases (Section 5.3.3)."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 3.2 reports that naive activation-only VQ is 'highly memory bounded for decoding' and 'unfavorable for tasks involving short inputs or long outputs.' The paper explicitly identifies that activation-only quantization loads 16x larger data than FP16 weights, motivating the co-quantization approach."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract's claims of 1.66x lower end-to-end latency than MI210 and 1.72x higher energy efficiency than A100 are supported by Figures 11 and 12 and the detailed results in Section 5.3.2. The 2.16x energy efficiency for 32B model scaling is discussed in Section 5.3.4."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Causal claims about why co-quantization outperforms other schemes are supported by the performance model in Section 3 with detailed mathematical derivations. The ablation of quantization schemes and execution strategies in Sections 3.2 and 4.4 provides controlled single-variable analysis."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The title 'Efficient Large Language Model Inference' implies broad applicability, but results are demonstrated only on a customized Qwen 3 1.7B model with a 512-token context window on a single FPGA. The 32B scaling result in Section 5.3.4 is only a performance model projection, not a measured result. The paper does not explicitly bound its claims to the tested configuration."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper does not discuss alternative explanations for the observed performance gains. For example, it does not consider whether the advantages come primarily from the process node differences, different power envelopes, or specific properties of the Qwen 3 architecture rather than the LUT-LLM approach itself."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper specifies 'Qwen 3 1.7B' as the target model and provides exact hardware part numbers: AMD Alveo V80 FPGA, AMD Instinct MI210, NVIDIA A100. Tool versions are specified: Xilinx Vitis HLS 2024.2, Vivado 2024.2."
    139       },
    140       "prompts_provided": {
    141         "applies": false,
    142         "answer": false,
    143         "justification": "This paper does not use prompting as part of its methodology. It is a hardware accelerator paper that evaluates inference latency and model quality on standard benchmarks."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Section 5.1 reports quantization hyperparameters (G=512, v=2, cw=16, ca=64, INT8 lookup tables, equivalent to W2A3), training setup (reconstruction loss ratio 0.1, STE with adjustable gradients, 3 epochs on WikiQA, 512-token sequences), and the achieved clock frequency (227 MHz)."
    149       },
    150       "scaffolding_described": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "No agentic scaffolding is used. This is a hardware accelerator paper."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 5.1 documents the quantization-aware training pipeline: pretrain on FineWeb, finetune on WikiQA, apply activation vector quantization with two-stage training strategy from LUT-DLA, reconstruct weights, apply GPTVQ for weight quantization, then pre-compute 2D lookup tables. Per-tensor zero-point quantization is described with equations."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "There is no dedicated limitations section. Some limitations are mentioned in passing (512-token context window in Section 5.1, scaling discussion in Section 5.3.4) but there is no structured limitations or threats-to-validity section."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "No threats to validity are discussed. Potential concerns such as FPGA vs GPU comparison fairness (different process nodes, power budgets, software maturity), simulator accuracy for FPGA baselines, and the customized model not being a standard Qwen 3 checkpoint are not addressed."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No explicit scope boundaries are stated. The paper does not clarify what the results do NOT show, such as that the 32B scaling is projected rather than measured, or that results may not generalize to other model architectures or longer context lengths."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No raw data (latency traces, power measurements, detailed per-layer timing) is available for independent verification."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 5.1 describes GPU benchmarking methodology: vLLM backend, FlashAttention and FlashDecoding enabled, GPTQ INT8 quantization, power monitoring via pyNVML and pyrsmi. FPGA baseline simulation methodology is described with 'less than 2% gap from data in the original papers.'"
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "No human participants. The paper evaluates hardware on standard benchmarks."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The data pipeline from model training through quantization to hardware deployment is documented in Section 5.1: pretrain on FineWeb -> finetune on WikiQA -> activation VQ training -> weight reconstruction -> GPTVQ -> 2D lookup table computation -> FPGA deployment. GPU benchmarking pipeline is also described."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No funding disclosure or acknowledgments section mentioning grants or sponsors is present in the paper."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Author affiliations are clearly disclosed: UCLA and Microsoft Research Asia. A footnote states 'Work done during internship at Microsoft.'"
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "Three of five authors are from Microsoft Research Asia, which has commercial interest in efficient LLM inference. The AMD V80 FPGA is the target platform, and AMD hardware is used for comparison. No discussion of whether these affiliations create conflicts."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No competing interests or financial interests statement is present in the paper."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The paper uses Qwen 3 1.7B but does not state its training data cutoff date. This matters because the model is evaluated on GLUE, SQuAD v2, and MMLU-Pro."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No discussion of potential train/test overlap. GLUE and SQuAD v2 are older benchmarks (both released in 2018) that could easily be in the training data of Qwen 3."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "GLUE (2018) and SQuAD v2 (2018) were available well before Qwen 3's training. MMLU-Pro (2024) is more recent but still likely pre-dates training. No contamination discussion is provided."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants in this study."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants in this study."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": true,
    280         "justification": "Inference latency is reported in milliseconds across multiple configurations (Figures 11, 14). Energy efficiency is reported in tokens per Joule (Figure 12, Table 5). Power consumption of the FPGA (peak 190W) and GPUs (300W) is stated in Table 2."
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": true,
    285         "justification": "Section 5.1 states training cost: 'an eight-GPU cluster will take 2 weeks to first pretrain Qwen 3 1.7B on FineWeb for 512-token sequences and then finetune on WikiQA with 3 epochs, with 75% memory utilization.' FPGA resource utilization is detailed in Figure 13."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "LUT-LLM achieves 1.66x lower end-to-end latency than the AMD MI210 GPU for Qwen 3 1.7B inference.",
    292       "evidence": "Figure 11 shows latency comparison across input/output length configurations. Section 5.3.2 reports 'LUT-LLM has a 1.66x lower geomean latency than MI210, despite utilizing 7x less memory bandwidth.'",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "LUT-LLM is 1.72x more energy efficient than the NVIDIA A100 GPU.",
    297       "evidence": "Figure 12 shows energy efficiency comparisons. Section 5.3.2 states 'LUT-LLM remains 1.72x more energy efficient in geomean.'",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "LUT-LLM is 4.1x more energy efficient than AMD MI210.",
    302       "evidence": "Section 5.3.2 and Figure 12 report '4.1x higher geomean energy efficiency than MI210.'",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "Activation-weight co-quantization achieves the best performance across both prefill and decode stages.",
    307       "evidence": "Figure 6 compares normalized throughputs of various quantization schemes. Section 3.2 provides performance model analysis showing co-quantization has superior throughput. Section 3.1 derives that co-quantization gives 569 cycles vs. 8256 for activation-only and 1090 for weight-only in the worked example.",
    308       "supported": "strong"
    309     },
    310     {
    311       "claim": "LUT-LLM's quantization incurs only 2.7% performance drop relative to FP16 baseline.",
    312       "evidence": "Table 3 shows model quality across GLUE sub-tasks and SQuAD v2 for different quantization schemes. Section 5.2 states '2.7% performance drop relative to the FP16 baseline with all techniques applied.'",
    313       "supported": "moderate"
    314     },
    315     {
    316       "claim": "LUT-LLM can scale to a 32B model with 2.16x better energy efficiency than the A100.",
    317       "evidence": "Section 5.3.4 mentions this claim, but it is based on a performance model projection, not an actual implementation. The paper states 'the current training algorithm limits the model size' and describes extending 'with simple changes of loop bounds.'",
    318       "supported": "weak"
    319     }
    320   ],
    321   "methodology_tags": [
    322     "benchmark-eval",
    323     "case-study"
    324   ],
    325   "key_findings": "LUT-LLM is the first FPGA accelerator for 1B+ language model inference using memory-based computation via vector quantization. The paper demonstrates that activation-weight co-quantization with 2D lookup tables outperforms weight-only or activation-only quantization by reducing memory bandwidth demands and on-chip port requirements. On AMD V80 FPGA running a customized Qwen 3 1.7B model, LUT-LLM achieves 1.66x lower latency than AMD MI210 and 1.72x higher energy efficiency than NVIDIA A100, while maintaining competitive model quality (2.7% drop from FP16 baseline).",
    326   "red_flags": [
    327     {
    328       "flag": "No variance or uncertainty quantification",
    329       "detail": "All performance results (latency, energy efficiency) are reported as single point estimates without any error bars, confidence intervals, or variance from multiple runs. Hardware measurements can vary due to thermal throttling, power states, and other factors."
    330     },
    331     {
    332       "flag": "Unfair FPGA baseline comparison",
    333       "detail": "The FPGA baselines (Allo, InTAR, FlightLLM) were originally reported on the older AMD U280 FPGA, while LUT-LLM uses the newer AMD V80. Rather than running them on the same hardware, the paper constructs a simulator for these baselines 'with less than 2% gap from data in the original papers', introducing potential simulation bias."
    334     },
    335     {
    336       "flag": "Limited context window",
    337       "detail": "Results are limited to 512-token context windows due to training memory constraints. This is a significant practical limitation not prominently highlighted, as modern LLMs routinely handle 4K-128K tokens."
    338     },
    339     {
    340       "flag": "32B scaling claim is projection only",
    341       "detail": "The claim of 2.16x energy efficiency over A100 for 32B models (in the abstract) is based on a performance model projection, not actual implementation and measurement. This is not clearly distinguished from measured results."
    342     },
    343     {
    344       "flag": "No contamination analysis for model quality benchmarks",
    345       "detail": "Model quality is evaluated on GLUE (2018) and SQuAD v2 (2018), which are likely in Qwen 3's training data. Without contamination analysis, the quality preservation claims may be overstated."
    346     },
    347     {
    348       "flag": "No limitations section",
    349       "detail": "The paper lacks any dedicated discussion of limitations or threats to validity, despite several notable limitations (single model, single FPGA, limited context, simulated baselines, customized model)."
    350     }
    351   ],
    352   "cited_papers": [
    353     {
    354       "title": "FlightLLM: Efficient Large Language Model Inference with a Complete Mapping Flow on FPGAs",
    355       "authors": ["Shulin Zeng", "Jun Liu", "Guohao Dai"],
    356       "year": 2024,
    357       "relevance": "State-of-the-art FPGA LLM accelerator serving as a key baseline for hardware performance comparison."
    358     },
    359     {
    360       "title": "Allo: A Programming Model for Composable Accelerator Design",
    361       "authors": ["Hongzheng Chen", "Niansong Zhang", "Shaojie Xiang"],
    362       "year": 2024,
    363       "relevance": "FPGA accelerator framework for LLMs serving as a baseline, demonstrates composable accelerator design methodology."
    364     },
    365     {
    366       "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention",
    367       "authors": ["Woosuk Kwon", "Zhuohan Li", "Siyuan Zhuang"],
    368       "year": 2023,
    369       "relevance": "Introduces vLLM framework used as the GPU benchmarking backend in this work."
    370     },
    371     {
    372       "title": "GPTQ: Accurate Post-Training Quantization for Generative Pre-Trained Transformers",
    373       "authors": ["Elias Frantar", "Saleh Ashkboos", "Torsten Hoefler", "Dan Alistarh"],
    374       "year": 2022,
    375       "arxiv_id": "2210.17323",
    376       "relevance": "Key quantization method used for GPU baselines and as a comparison approach for model quality."
    377     },
    378     {
    379       "title": "SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models",
    380       "authors": ["Guangxuan Xiao", "Ji Lin", "Mickael Seznec"],
    381       "year": 2023,
    382       "relevance": "Scalar quantization baseline compared for model quality preservation under aggressive quantization."
    383     },
    384     {
    385       "title": "LUT-NN: Empower Efficient Neural Network Inference with Centroid Learning and Table Lookup",
    386       "authors": ["Xiaohu Tang", "Yang Wang", "Ting Cao"],
    387       "year": 2023,
    388       "relevance": "Foundational work on lookup-table-based neural network inference using vector quantization and centroid learning."
    389     },
    390     {
    391       "title": "LUT-DLA: Lookup Table as Efficient Extreme Low-Bit Deep Learning Accelerator",
    392       "authors": ["Guoyu Li", "Shengyu Ye", "Chunyun Chen"],
    393       "year": 2025,
    394       "relevance": "ASIC-based LUT accelerator design that LUT-LLM builds upon, providing the two-stage training strategy for centroids and weights."
    395     },
    396     {
    397       "title": "T-MAC: CPU Renaissance via Table Lookup for Low-Bit LLM Deployment on Edge",
    398       "authors": ["Jianyu Wei", "Shijie Cao", "Ting Cao"],
    399       "year": 2025,
    400       "relevance": "CPU-based table lookup approach for LLM inference, demonstrating memory-based computation on different hardware platforms."
    401     },
    402     {
    403       "title": "GPTVQ: The Blessing of Dimensionality for LLM Quantization",
    404       "authors": ["Mart Van Baalen", "Andrey Kuzmin", "Ivan Koryakovskiy"],
    405       "year": 2024,
    406       "arxiv_id": "2402.15319",
    407       "relevance": "Vector post-training quantization method for LLMs used in the weight quantization step of LUT-LLM's pipeline."
    408     },
    409     {
    410       "title": "FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning",
    411       "authors": ["Tri Dao"],
    412       "year": 2023,
    413       "arxiv_id": "2307.08691",
    414       "relevance": "Key GPU optimization enabling competitive GPU baselines for LLM inference performance comparison."
    415     },
    416     {
    417       "title": "Understanding the Potential of FPGA-based Spatial Acceleration for Large Language Model Inference",
    418       "authors": ["Hongzheng Chen", "Jiahao Zhang", "Yixiao Du"],
    419       "year": 2024,
    420       "relevance": "Foundational analysis of FPGA-based LLM acceleration that provides the performance modeling methodology built upon in this work."
    421     }
    422   ]
    423 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs