scan.json (28835B)
1 { 2 "paper": { 3 "title": "RooflineBench: A Benchmarking Framework for On-Device LLMs via Roofline Analysis", 4 "authors": [ 5 "Zhen Bi", 6 "Xueshu Chen", 7 "Luoyang Sun", 8 "Yuhang Yao", 9 "Qing Shen", 10 "Jungang Lou", 11 "Cheng Deng" 12 ], 13 "year": 2026, 14 "venue": "arXiv", 15 "arxiv_id": "2602.11506" 16 }, 17 "scan_version": 2, 18 "active_modules": ["experimental_rigor", "data_leakage"], 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "Appendix C states: 'The code for this work is available at https://github.com/banbu-ai/roofline_bench.' A concrete GitHub URL is provided." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "Raw benchmark measurement data (inference logs, memory traces) is not released as a downloadable dataset. The paper shows results in figures and tables but no raw data files are provided." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "Hardware specifications are detailed in Table 1, and backends (CUDA, MPS) are mentioned. However, no requirements.txt, Dockerfile, or detailed software environment listing library versions is provided. Mentioning PyTorch and llama-bench without version pinning is insufficient." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "While the GitHub repository is linked and Appendix C describes experimental settings (models, precisions, scenarios), there are no step-by-step reproduction instructions (README with commands, scripts to replicate experiments)." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "No confidence intervals or error bars are reported. All results are point estimates — single TPS values in tables and single data points on Roofline plots." 
47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper makes comparative claims (e.g., 'MLA consistently achieves the highest operational intensity and attainable performance') based solely on comparing numbers, with no statistical significance tests." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "Results are reported as absolute values (GFLOPS, TPS) without standardized effect sizes. Comparative claims lack effect size quantification — no percentage improvements or relative magnitudes are systematically reported." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "No justification is given for why this particular set of models, hardware platforms, or sequence length configurations was chosen. The number of repeated measurements per benchmark run is not stated." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Tables 3-5 show single TPS values. Run-to-run variability in inference benchmarks can be significant but is not addressed." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "Multiple models serve as comparisons: Qwen2.5, Llama 3.2, PLM, SmolLM2, Fox-1, Qwen3, and Pythia family. Different attention architectures (MHA, GQA, MLA) are compared head-to-head." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "Models tested are contemporary: Qwen2.5 (2024), Llama 3.2 (2024), PLM (2025), Qwen3 (2025), SmolLM2 (2024). Hardware includes recent platforms (M1 Pro, RTX 3090, Jetson Orin Nano Super)." 
79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "The paper systematically varies individual factors: model depth (2-64 layers, Sec 4.2), numerical precision (FP16/Q8_0/Q4_K_M, Sec 4.3.1), and attention mechanism (MHA/GQA/MLA, Sec 4.3.2), isolating each factor's contribution." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "Multiple metrics are reported: Operational Intensity (FLOPs/Byte), Performance (GFLOPS/sec), Relative Inference Potential (Φ), and throughput (TPS for both prefilling and decoding)." 89 }, 90 "human_evaluation": { 91 "applies": false, 92 "answer": false, 93 "justification": "This is a systems performance profiling paper measuring hardware utilization metrics. Human evaluation is structurally irrelevant to claims about inference throughput and operational intensity." 94 }, 95 "held_out_test_set": { 96 "applies": false, 97 "answer": false, 98 "justification": "This is a systems benchmarking paper measuring inference performance, not a learning task with train/test splits. The concept of held-out test sets does not apply." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Results are extensively broken down by model architecture, hardware platform (5 devices), precision level (3 formats), sequence scenario (SISO/SILO/LISO/LILO), and layer count. Figures 2-6 and Tables 3-5 provide granular breakdowns." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper identifies the OI regression at increased depth (Insight 2), the 'efficiency trap' from hardware heterogeneity (Insight 5), GQA performing worst among the three architectures at 1.5B scale, and the severe memory-bound regime in SILO scenarios." 
109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "Several negative findings are reported: OI regression beyond 3-5 layers despite increasing performance (Sec 4.2), GQA exhibiting the lowest efficiency among three attention mechanisms (Sec 4.3.2), and quantization providing diminishing returns in compute-heavy scenarios (Sec 4.3.1)." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "All abstract claims are supported: (1) OI varies with sequence length (Fig 2, Sec 4.1), (2) OI regression with depth (Fig 3, Sec 4.2), (3) efficiency trap from hardware heterogeneity (Fig 5, Sec 5.1), (4) MLA unlocks inference potential (Fig 6, Sec 4.3.2)." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "Causal claims ('sequence length determines OI', 'MLA effectively compresses KV cache traffic') are supported by controlled single-variable experiments. Layer scaling holds architecture constant; attention comparisons use matched ~1.5B parameter counts. The study design adequately supports the causal direction." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": true, 130 "justification": "Claims are generally scoped to the tested configurations. The paper's title scopes the work to 'On-Device LLMs', and it tests across 5 representative platforms. Future Work (Appendix B) explicitly acknowledges the need to expand to MoE architectures, more hardware, and different inference engines." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper presents its interpretations of the results (memory wall, KV cache compression) but does not systematically consider alternative explanations. 
For example, the OI regression could be influenced by software framework overhead rather than purely physical constraints, but this is not explored." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": true, 140 "justification": "The paper's claims match the granularity of its measurements. It measures FLOPs, throughput, and OI, and claims are about inference performance and hardware utilization — no proxy gap exists." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": true, 147 "justification": "Specific model versions are provided: 'Qwen2.5-1.5B-Instruct', 'Llama-3.2-1B-Instruct', 'PLM-1.8B-Instruct', 'SmolLM2-1.7B-Instruct', 'Fox-1-1.6B', 'Qwen3-0.6B'. For open-weight models, the model name with size is the version." 148 }, 149 "prompts_provided": { 150 "applies": false, 151 "answer": false, 152 "justification": "This paper does not use prompting. It runs inference benchmarks via llama-bench with parameterized sequence lengths, not prompt-based evaluation." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": true, 157 "justification": "Key parameters are reported: precisions (FP16, Q8_0, Q4_K_M), sequence length configurations (SISO/SILO/LISO/LILO with specific lengths), layer counts (2-64), thread counts, and hardware configurations (Table 1)." 158 }, 159 "scaffolding_described": { 160 "applies": false, 161 "answer": false, 162 "justification": "No agentic scaffolding is used. The paper runs inference benchmarks directly via llama-bench." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Appendix C.3 documents the data pipeline: hardware characterization via synthetic benchmarks, FLOPs calculation methodology (Table 2), memory monitoring strategy per platform, inference workflow with parameterized llama-bench runs, and automated post-hoc analysis from synchronized logs." 
168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "Appendix B ('Future Work') provides substantive discussion of limitations: need to expand to MoE architectures, limited hardware diversity, and software stack variance not investigated. While titled 'Future Work', it functions as a dedicated limitations discussion." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": true, 179 "justification": "Appendix B identifies specific threats: (1) FLOPs estimation for MoE with stochastic routing is a formidable challenge not addressed, (2) different inference engines (TensorRT LLM, vLLM, ONNX Runtime) introduce substantial variance, (3) only dense architectures are covered. These are specific to this study." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": true, 184 "justification": "Appendix B explicitly states what was NOT tested: MoE architectures, broader hardware range, and different software stacks. The paper also explicitly scopes to 'dense architectures' in the discussion." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": false, 191 "justification": "Raw benchmark logs and measurement data are not provided for download. Only aggregated results appear in tables and figures. The GitHub repo appears to contain benchmarking code but not raw data." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Appendix C.3 describes data collection in detail: hardware characterization via PyTorch matrix operations, inference via llama-bench with specific parameters, memory monitoring through platform-specific strategies (RSS sampling, nvidia-smi XML parsing), and timestamp-synchronized logging." 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants. 
The 'subjects' are well-defined hardware platforms and publicly available model architectures — no recruitment process applies." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "Appendix C.3 documents the full pipeline: hardware profiling → parameterized inference runs → concurrent memory monitoring → JSON-formatted inference logs → automated post-hoc analysis. The FLOPs calculation formulas are provided in Table 2." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "No funding or acknowledgments section is present in the paper. There is no mention of funding sources or grants." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are listed: Huzhou University, Banbu AI Foundation, Chinese Academy of Sciences, Carnegie Mellon University, and University of Edinburgh." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": false, 223 "justification": "Authors affiliated with Banbu AI Foundation evaluate PLM (Deng et al., 2025), which is Banbu AI's own model. PLM (using MLA) is consistently shown as the best-performing architecture. The funder has a direct interest in PLM performing well." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial disclosures statement is present. The paper does not declare whether authors hold patents, equity, or financial interests related to PLM or Banbu AI Foundation." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": false, 234 "answer": false, 235 "justification": "This paper measures inference performance (throughput, FLOPS, OI), not model capability or accuracy on benchmarks. Training data contamination is irrelevant to hardware performance profiling." 
236 }, 237 "train_test_overlap_discussed": { 238 "applies": false, 239 "answer": false, 240 "justification": "No model capability is being evaluated — the paper profiles inference performance characteristics. Train/test overlap has no bearing on throughput measurements." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": false, 244 "answer": false, 245 "justification": "No benchmark knowledge contamination risk exists. The paper measures hardware utilization and inference speed, not whether models can answer benchmark questions correctly." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study. It is a systems performance benchmarking paper." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants. No ethics review is needed for hardware benchmarking." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": true, 289 "justification": "The paper's core contribution IS reporting inference performance: throughput (TPS) in Tables 3-5, GFLOPS across all experiments, latency characteristics across scenarios. Inference cost/performance is the primary output." 
290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "Hardware used is specified (Table 1), but the total computational budget for running all experiments (GPU hours, wall-clock time for the full benchmark suite) is not stated." 295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": false, 301 "justification": "No mention of multiple random seeds or run-to-run variability. Inference benchmarks can exhibit variance from system load, thermal throttling, and memory state, but this is not addressed." 302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": false, 306 "justification": "The number of experimental runs per configuration is not stated. It is unclear whether results are from single runs or averaged across multiple runs." 307 }, 308 "hyperparameter_search_budget": { 309 "applies": false, 310 "answer": false, 311 "justification": "No hyperparameter search was performed. The paper systematically evaluates fixed configurations (models, precisions, scenarios, layer counts) without optimization." 312 }, 313 "best_config_selection_justified": { 314 "applies": false, 315 "answer": false, 316 "justification": "The paper does not select a 'best' configuration — it characterizes all configurations systematically across the design space." 317 }, 318 "multiple_comparison_correction": { 319 "applies": false, 320 "answer": false, 321 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": true, 325 "answer": false, 326 "justification": "Authors from Banbu AI Foundation evaluate PLM (their own model) as the MLA representative, and it consistently outperforms alternatives. The paper does not acknowledge the bias of evaluating their own system or discuss this potential conflict." 
327 }, 328 "compute_budget_vs_performance": { 329 "applies": true, 330 "answer": true, 331 "justification": "The paper explicitly controls for model scale by comparing attention mechanisms at matched ~1.5B parameters (Sec 4.3.2: 'all LLMs are scaled to a unified size of approximately 1.5B parameters by adjusting their respective layer counts'). Performance is shown across matched compute conditions." 332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": true, 336 "justification": "The paper thoroughly discusses what its metrics measure: OI is analytically defined (Eq. 3-5), the Roofline model is a well-established performance analysis framework (Williams et al., 2009), and the relationship between measured metrics and hardware limits is formally established." 337 }, 338 "scaffold_confound_addressed": { 339 "applies": false, 340 "answer": false, 341 "justification": "No scaffolding is involved. The paper benchmarks raw inference performance via llama-bench." 342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": false, 347 "answer": false, 348 "justification": "This paper measures inference performance (throughput, FLOPS), not model knowledge or accuracy. Temporal leakage concepts do not apply to hardware performance profiling." 349 }, 350 "feature_leakage_addressed": { 351 "applies": false, 352 "answer": false, 353 "justification": "No prediction task is involved. The paper profiles hardware utilization during inference, where feature leakage is structurally irrelevant." 354 }, 355 "non_independence_addressed": { 356 "applies": false, 357 "answer": false, 358 "justification": "No train/test data split exists in this performance benchmarking study. Independence of data splits is not applicable." 359 }, 360 "leakage_detection_method": { 361 "applies": false, 362 "answer": false, 363 "justification": "No leakage risk exists in performance profiling. Detection methods are not applicable." 
364 } 365 } 366 }, 367 "claims": [ 368 { 369 "claim": "Context length is the primary factor determining both the operational intensity and performance of on-device decoding, with LISO approaching compute-bound and SILO remaining severely memory-bound.", 370 "evidence": "Figure 2 shows LISO (yellow dots) consistently achieving highest OI and proximity to the Roofline ridge point across all models, while SILO (red dots) remains deep in the memory-bound regime. Demonstrated across Qwen2.5, Llama 3.2, PLM, and SmolLM2 on Apple M1 Pro (Sec 4.1).", 371 "supported": "strong" 372 }, 373 { 374 "claim": "On-device decoding reaches peak operational intensity at a remarkably shallow depth (3-5 layers), beyond which OI regresses due to memory bandwidth overhead.", 375 "evidence": "Figure 3 shows a non-monotonic 'arch' trajectory of OI as layer count increases from 2 to 64 across FP16, Q8_0, and Q4_K_M precisions on Apple M1 Pro. The inflection occurs at 3-5 layers (Sec 4.2).", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Quantization provides maximal efficiency gains for memory-bound tasks but yields diminishing returns in compute-heavy scenarios.", 380 "evidence": "Figure 4 shows FP16→Q4_K_M yields dramatic OI and GFLOPS gains in SILO (memory-bound) but tight clustering in LISO (near compute-bound). Demonstrated across all 6 model architectures (Sec 4.3.1).", 381 "supported": "strong" 382 }, 383 { 384 "claim": "MLA outperforms both MHA and GQA by effectively compressing KV cache traffic to maximize operational intensity on resource-constrained devices.", 385 "evidence": "Figure 6 shows MLA (PLM, blue dots) consistently achieving highest OI and GFLOPS across all four sequence scenarios at matched ~1.5B parameters on Apple M1 Pro. 
Cross-platform comparison in Figure 5 (right) confirms this on the RTX 3070Ti Laptop (Sec 4.3.2, 5.2).", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Hardware heterogeneity creates an 'efficiency trap' where a single model architecture cannot achieve uniform utilization equity across platforms.", 390 "evidence": "Figure 5 (left) shows ridge points spanning 8.98 (Raspberry Pi 5) to 38.00 (RTX 3090) FLOPs/Byte, with peak performance spanning three orders of magnitude. The same 1.5B model operates in different bottleneck regimes on different hardware (Sec 5.1).", 391 "supported": "strong" 392 }, 393 { 394 "claim": "Architectural optimization (MLA) demonstrates consistent cross-platform robustness, maintaining a higher baseline of operational intensity across hardware tiers.", 395 "evidence": "Figure 5 (right) shows MLA maintaining a superior position on both Apple M1 Pro and RTX 3070Ti Laptop in the LISO scenario, with a parallel shift toward upper-right rather than convergence (Sec 5.2).", 396 "supported": "moderate" 397 } 398 ], 399 "methodology_tags": ["benchmark-eval"], 400 "key_findings": "RooflineBench characterizes LLM inference on edge devices using the Roofline model, revealing that context length is the primary determinant of operational intensity and that OI peaks at 3-5 layers before regressing. The framework identifies an 'efficiency trap' from hardware ridge-point disparities across heterogeneous platforms. Multi-head Latent Attention (MLA) consistently achieves the highest operational intensity and cross-platform robustness compared to MHA and GQA at matched model scales, demonstrating how architectural refinements can bridge the gap between theoretical potential and real-world execution.", 401 "red_flags": [ 402 { 403 "flag": "Self-evaluation conflict of interest", 404 "detail": "Authors affiliated with Banbu AI Foundation evaluate PLM (Deng et al., 2025), which is Banbu AI's own model. 
PLM (MLA) is consistently shown as the best-performing architecture throughout the paper. This conflict is never disclosed or acknowledged." 405 }, 406 { 407 "flag": "No error bars or run-to-run variance", 408 "detail": "All results are single point estimates without any uncertainty quantification. Inference benchmarks can vary significantly due to system load, thermal throttling, memory state, and OS scheduling. The paper makes strong comparative claims ('MLA consistently achieves the highest') without statistical validation." 409 }, 410 { 411 "flag": "Potentially unfair MLA comparison", 412 "detail": "The MLA representative is PLM-1.8B while GQA uses Qwen2.5-1.5B. Although matched to ~1.5B by adjusting layer counts, the architectures differ in hidden dimension, head count, and other design choices beyond just the attention mechanism. Attributing the performance difference solely to the attention type oversimplifies the comparison." 413 } 414 ], 415 "cited_papers": [ 416 { 417 "title": "PaLM: Scaling language modeling with pathways", 418 "authors": ["Aakanksha Chowdhery"], 419 "year": 2023, 420 "relevance": "Introduces Model FLOPs Utilization (MFU) metric used to evaluate LLM training and inference efficiency." 421 }, 422 { 423 "title": "Efficiently scaling transformer inference", 424 "authors": ["Reiner Pope"], 425 "year": 2023, 426 "relevance": "Analytical framework for Transformer inference that RooflineBench builds upon for decomposing prefill and decoding phases." 427 }, 428 { 429 "title": "FlashAttention: Fast and memory-efficient exact attention with IO-awareness", 430 "authors": ["Tri Dao"], 431 "year": 2022, 432 "arxiv_id": "2205.14135", 433 "relevance": "IO-aware attention optimization for memory-efficient inference, directly relevant to on-device LLM efficiency." 
434 }, 435 { 436 "title": "DeepSeek-V2: A strong, economical, and efficient mixture-of-experts language model", 437 "authors": ["DeepSeek-AI"], 438 "year": 2024, 439 "arxiv_id": "2405.04434", 440 "relevance": "Introduces Multi-head Latent Attention (MLA) evaluated in RooflineBench as the most efficient attention architecture." 441 }, 442 { 443 "title": "MiniCPM: Unveiling the potential of small language models with scalable training strategies", 444 "authors": ["Shengding Hu"], 445 "year": 2024, 446 "arxiv_id": "2404.06395", 447 "relevance": "Small language model with trainable sparse attention, demonstrating efficient on-device inference capabilities." 448 }, 449 { 450 "title": "Phi-3 technical report: A highly capable language model locally on your phone", 451 "authors": ["Marah I. Abdin"], 452 "year": 2024, 453 "arxiv_id": "2404.14219", 454 "relevance": "Demonstrates compact LLM architecture capable of on-device deployment, directly relevant to on-device intelligence evaluation." 455 }, 456 { 457 "title": "AWQ: activation-aware weight quantization for on-device LLM compression and acceleration", 458 "authors": ["Ji Lin"], 459 "year": 2024, 460 "relevance": "Post-training quantization technique for on-device LLM inference, relevant to efficiency-focused evaluation." 461 }, 462 { 463 "title": "GPTQ: accurate post-training quantization for generative pre-trained transformers", 464 "authors": ["Elias Frantar"], 465 "year": 2022, 466 "arxiv_id": "2210.17323", 467 "relevance": "Post-training quantization method for reducing LLM memory footprint, directly relevant to on-device deployment." 468 }, 469 { 470 "title": "LLM in a flash: Efficient large language model inference with limited memory", 471 "authors": ["Keivan Alizadeh"], 472 "year": 2024, 473 "doi": "10.18653/V1/2024.ACL-LONG.678", 474 "relevance": "Addresses memory-constrained LLM inference showing the 'memory wall' bottleneck for generative models on edge devices." 
475 }, 476 { 477 "title": "MobileLLM: Optimizing sub-billion parameter language models for on-device use cases", 478 "authors": ["Zechun Liu"], 479 "year": 2024, 480 "relevance": "Compact architecture optimized for mobile deployment, representative of the SLM trend evaluated in this paper." 481 }, 482 { 483 "title": "Holistic evaluation of language models", 484 "authors": ["Percy Liang"], 485 "year": 2023, 486 "relevance": "Comprehensive multi-metric LLM evaluation framework (HELM), relevant as a capability-centric benchmark that contrasts with system-level evaluation." 487 }, 488 { 489 "title": "MLPerf inference benchmark", 490 "authors": ["Vijay Janapa Reddi"], 491 "year": 2019, 492 "relevance": "Industry-standard inference benchmarking protocol for heterogeneous hardware, foundational for system-level AI evaluation." 493 } 494 ] 495 }