scan.json (24901B)
1 { 2 "paper": { 3 "title": "Theoretical Foundations of Scaling Law in Familial Models", 4 "authors": [ 5 "Huan Song", 6 "Qingfei Zhao", 7 "Ting Long", 8 "Shuyu Tian", 9 "Hongjun An", 10 "Jiawei Shao", 11 "Xuelong Li" 12 ], 13 "year": 2025, 14 "venue": "arXiv", 15 "arxiv_id": "2512.23407", 16 "doi": "10.48550/arXiv.2512.23407" 17 }, 18 "scan_version": 2, 19 "active_modules": [], 20 "methodology_tags": ["theoretical"], 21 "key_findings": "This paper extends neural scaling laws to Familial Models (multi-exit architectures) by introducing granularity G as a third scaling variable alongside model size N and data D. The fitted law L(N,D,G) reveals the granularity exponent γ ≈ 0.033 is negligibly small, meaning multiple exit points incur minimal training loss overhead. A branch-level scaling law shows that branch points upstream of a given exit have virtually no impact on that exit's loss (α = 10⁻³), whereas downstream branch points carry a larger penalty (β ≈ 0.04). Efficiency Leverage analysis confirms Familial Models outperform size-matched dense models under equal FLOPs budgets across all tested compute regimes (10¹⁹–10²¹ FLOPs).", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": false, 27 "justification": "No code repository, GitHub link, or archive URL is provided anywhere in the paper." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": false, 32 "justification": "No experimental data (loss measurements, training configurations, fitted curves) is released. The training dataset used for model pre-training is not even identified." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "No environment specifications, hardware descriptions, library versions, or dependency files are provided." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "No reproduction instructions are provided. 
The experimental design is described at a high level (IsoFLOP, model sizes, granularities) but step-by-step reproduction details are absent." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "Fitted scaling law parameters (E, A, B, α, β, γ) are reported as point estimates without confidence intervals or error bars. No uncertainty quantification on the fitted parameters." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper claims γ ≈ 0.0333 is 'extremely small' and α = 10⁻³ is 'negligible' without any formal statistical tests. No tests are used to determine whether these parameters are significantly different from zero." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "The Efficiency Leverage metric (EL = L_Dense/L_Fam) directly quantifies the performance advantage as a ratio across FLOPs budgets (Section 4.4, Figure 7). Branch penalty coefficients are compared (α = 10⁻³ vs β = 0.0397, a ~40x difference) providing concrete effect magnitudes with baseline context." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "The paper mentions fitting to 'over 100 experimental configurations' but does not justify why this number is sufficient for reliable parameter estimation of a 6-parameter model." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "Each experimental configuration yields a single loss value. No variance across random seeds, no standard deviations, no repeated runs are reported." 70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "Dense model baselines with matched parameter counts and compute budgets are included throughout (Tables 1–2, Figures 3–4). The entire branch scaling law (Eq. 
3–4) is defined relative to dense baselines." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "The dense transformer baselines are the appropriate contemporary comparison for multi-exit architectures. The scaling law framework builds on Chinchilla (Hoffmann et al., 2022) and recent corrections (Pearce & Song 2024, Porian et al. 2024)." 82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "The branch scaling law analysis (Section 4.3) systematically varies the number of upstream (P) and downstream (Q) branches while measuring their independent effects on loss. The IsoFLOP design varies N, D, and G independently. Figures 3–6 show the individual branch-level effects." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": false, 91 "justification": "The only measured quantity is training loss (cross-entropy). Efficiency Leverage (EL) is derived entirely from loss values. No downstream task evaluations, perplexity on held-out data, or other independent metrics are reported." 92 }, 93 "human_evaluation": { 94 "applies": false, 95 "answer": false, 96 "justification": "Human evaluation is irrelevant to a scaling law study that measures training loss curves across architectural configurations." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": false, 101 "justification": "Figures 3 and 4 label the y-axis 'Training loss.' The paper does not mention held-out validation or test sets for measuring the scaling law relationships." 102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Results are broken down by granularity level (G = 1–4, Figures 1, 3–7), by branch position (first vs second branch point, Figures 3–6), by model family size (fam2B, fam4B, fam8B, fam12B), and by FLOPs budget." 
107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": false, 111 "justification": "The paper presents uniformly positive results. No discussion of where the scaling law breaks down, fits poorly, or where Familial Models underperform dense models." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": false, 116 "justification": "Every experiment shows Familial Models performing favorably. The downstream branch penalty (β = 0.0397) is presented as a minor cost rather than a negative finding. No failed configurations, poor fits, or abandoned approaches are reported." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "Abstract claims that γ ≈ 0.033 (supported by Eq. 2), EL > 1 across all compute regimes (supported by Figure 7), and advantage most pronounced in low-compute regime (supported by Figure 7) are all backed by results in the paper." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "Claims like 'additional upstream branches have negligible impact on performance' are supported by the IsoFLOP experimental design which controls compute budget while varying architectural parameters. The controlled single-variable manipulation (varying P and Q independently) is adequate for the causal claims made." 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": false, 133 "justification": "The title claims 'Theoretical Foundations' but experiments cover only 1B–12B parameters and 10¹⁹–10²¹ FLOPs. The conclusion states the architecture 'effectively addresses the demand for diverse deployment scales' without bounding this to the tested regime. No discussion of whether the scaling law holds at larger scales where LLMs are typically deployed (70B+, 10²³+ FLOPs)." 
134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "No alternative explanations are discussed. The paper does not consider whether the small γ might be an artifact of the specific training data, architecture choices (equal exit weights), or the limited scale range tested." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": false, 143 "justification": "The paper measures training loss but frames results in terms of 'deployment flexibility,' 'engineering practice,' and 'diverse application requirements' (Conclusion). The gap between training loss and actual deployment performance (downstream task accuracy, latency, real-world utility) is never acknowledged." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": true, 150 "justification": "Full architectural specifications are provided in Tables 1–2: d_model, ffn_size, num_attention_heads, n_layers, and exit_layer configurations for all model variants (1B–4B dense, 2B–4B family)." 151 }, 152 "prompts_provided": { 153 "applies": false, 154 "answer": false, 155 "justification": "The paper trains language models from scratch; no prompting is involved." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": false, 160 "justification": "Training hyperparameters (learning rate, batch size, optimizer, warmup schedule, weight decay) for model pre-training are not reported. Only the scaling law fitting procedure hyperparameters (Huber δ = 10⁻³, L-BFGS, grid initialization ranges) are described in Section 3.2." 161 }, 162 "scaffolding_described": { 163 "applies": false, 164 "answer": false, 165 "justification": "No agentic scaffolding is used. This is a model training and scaling law study." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": false, 170 "justification": "The training dataset is never identified. 
No mention of what text corpus was used, how it was preprocessed, tokenized, or filtered. This is a critical omission for reproducing the experiments." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": false, 177 "justification": "There is no limitations section. The paper has Introduction, Preliminaries, Scaling Law with Granularity, Results, Discussion, and Conclusion. The Discussion section discusses potential extensions and applications but not limitations." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": false, 182 "justification": "No threats to validity are discussed. The paper does not address potential issues like overfitting the scaling law to a narrow parameter range, sensitivity to the chosen functional form, or whether equal exit weights are optimal." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": false, 187 "justification": "No explicit scope boundaries are stated. The paper does not discuss what settings would fall outside the scaling law's validity or acknowledge that results are limited to 1B–12B models at 10¹⁹–10²¹ FLOPs." 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": false, 194 "justification": "No raw experimental data (loss values per training run, per-step loss curves) is made available for independent verification." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": false, 199 "justification": "The IsoFLOP experimental design is described (Section 3.3, Tables 1–2), but the training data corpus is never identified. Without knowing what data the models were trained on, the experimental procedure is incomplete." 200 }, 201 "recruitment_methods_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No human participants. 
Data comes from the authors' own controlled training experiments, not from external benchmarks or recruited participants." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": false, 209 "justification": "The scaling law fitting pipeline is documented (log-domain decomposition, Huber loss, L-BFGS in Section 3.2), but the model training pipeline (data preprocessing, tokenization, training procedure) is not documented." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding source or acknowledgments section is present. The authors are from China Telecom's AI institute but no funding details are provided." 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Author affiliations are clearly stated: 'Institute of Artificial Intelligence (TeleAI), China Telecom' appears under the title." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "The work is conducted at China Telecom's AI institute, which would benefit commercially from validating the 'Familial Models' paradigm for efficient edge-cloud deployment. The employer has a financial interest in the outcome. This conflict is not acknowledged." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests or financial interests statement is present in the paper." 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": false, 237 "answer": false, 238 "justification": "The paper trains models from scratch and measures training loss. It does not evaluate a pre-trained model's capability on any benchmark." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": false, 242 "answer": false, 243 "justification": "No pre-trained model is evaluated on benchmarks. 
The experiments are controlled training runs measuring training loss." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": false, 247 "answer": false, 248 "justification": "No benchmark evaluation is performed. Models are trained from scratch under IsoFLOP constraints and training loss is measured." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in this study." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": false, 292 "justification": "The paper motivates Familial Models by deployment efficiency and 'varying constraints on latency and computational cost' (Section 1) but never measures actual inference cost, latency, or throughput for any of the trained models." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": true, 297 "justification": "Training compute budgets are explicitly stated: 'fixed budgets (10¹⁹–10²¹ FLOPs)' (Section 1, Section 3.3). The IsoFLOP experimental design is built around controlled compute budgets." 
298 } 299 } 300 }, 301 "claims": [ 302 { 303 "claim": "The granularity exponent γ ≈ 0.0333 is extremely small, meaning supporting multiple exit points imposes negligible overhead on training loss.", 304 "evidence": "Fitted scaling law Eq. 2 in Section 4.1: L(N,D,G) = (1.0059 + 403.4289/N^0.2982 + 2980.058/D^0.3412) · G^0.0333. Fitted across IsoFLOP experiments at 10¹⁹–10²¹ FLOPs. Figure 1 shows only mild upward shift of loss surfaces across G = 1–4.", 305 "supported": "moderate" 306 }, 307 { 308 "claim": "Additional branch points before a given exit have only a negligible effect on its loss (α = 10⁻³), while downstream branches have a much larger impact (β = 0.0397).", 309 "evidence": "Branch scaling law Eq. 4 fitted to over 100 experimental configurations (Section 4.3). Figures 3–6 show the branch-level loss comparisons across granularity settings and FLOPs budgets.", 310 "supported": "moderate" 311 }, 312 { 313 "claim": "Efficiency Leverage (EL) remains strictly greater than 1 across all compute budgets, meaning Familial Models achieve lower loss than dense models under matched FLOPs.", 314 "evidence": "Section 4.4, Figure 7 shows EL > 1 for G ∈ {3,4,5,6} across the tested FLOPs range. However, EL is computed from the fitted scaling law rather than from direct experimental comparisons.", 315 "supported": "moderate" 316 }, 317 { 318 "claim": "The EL advantage is most pronounced in the low-compute regime.", 319 "evidence": "Figure 7 shows EL curves declining as FLOPs increase, with highest values at the low end of the 10¹⁹–10²¹ FLOPs range.", 320 "supported": "moderate" 321 }, 322 { 323 "claim": "The 'train once, deploy many' paradigm is validated — Familial Models can produce multiple sub-models from a single training run without compromising compute-optimality.", 324 "evidence": "Combination of small γ (Eq. 2) and EL > 1 (Section 4.4). 
However, validation is limited to training loss; no downstream task evaluation or actual deployment testing is conducted.", 325 "supported": "weak" 326 } 327 ], 328 "red_flags": [ 329 { 330 "flag": "Training data not disclosed", 331 "detail": "The paper never identifies what text corpus was used to train the language models. Without knowing the training data, the experiments cannot be reproduced and it is impossible to assess whether the scaling law is data-dependent." 332 }, 333 { 334 "flag": "Self-evaluation of own architecture", 335 "detail": "The Familial Models architecture originates from An et al. (2025), a paper co-authored by some of the present authors (Hongjun An, Jiawei Shao, Xuelong Li). China Telecom researchers are evaluating their own group's proposed architecture and finding it superior, without independent validation." 336 }, 337 { 338 "flag": "No error bars or uncertainty on fitted parameters", 339 "detail": "All six fitted parameters (E, A, B, α, β, γ) are reported as point estimates. Without confidence intervals, it is impossible to assess whether γ is statistically distinguishable from 0 or whether the claimed EL > 1 is robust to fitting uncertainty." 340 }, 341 { 342 "flag": "No limitations section", 343 "detail": "The paper has no dedicated limitations section and never discusses when the scaling law might break down, the narrow parameter range tested (1B–12B, 10¹⁹–10²¹ FLOPs), or potential confounds." 344 }, 345 { 346 "flag": "Training loss only — no downstream evaluation", 347 "detail": "All claims about deployment flexibility and engineering practice are based solely on training loss. The paper never evaluates the sub-models on downstream tasks, measures inference latency, or demonstrates actual deployment." 348 }, 349 { 350 "flag": "Missing training hyperparameters", 351 "detail": "Learning rate, batch size, optimizer, warmup schedule, and other training hyperparameters are not reported. Only the scaling law fitting procedure is described. 
This makes the experiments unreproducible." 352 }, 353 { 354 "flag": "Narrow scale range with broad claims", 355 "detail": "Experiments cover 1B–12B parameters at 10¹⁹–10²¹ FLOPs, but the paper claims to establish 'Theoretical Foundations' and validate the paradigm for 'ubiquitous intelligence.' No evidence the scaling law holds at scales where LLMs are typically deployed (70B+, 10²³+ FLOPs)." 356 } 357 ], 358 "cited_papers": [ 359 { 360 "title": "Scaling laws for neural language models", 361 "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan", "Tom B Brown", "Benjamin Chess", "Rewon Child", "Scott Gray", "Alec Radford", "Jeffrey Wu", "Dario Amodei"], 362 "year": 2020, 363 "arxiv_id": "2001.08361", 364 "relevance": "Foundational work characterizing LLM test loss as a predictable power-law function of model size, dataset size, and compute budget." 365 }, 366 { 367 "title": "Training compute-optimal large language models", 368 "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"], 369 "year": 2022, 370 "arxiv_id": "2203.15556", 371 "relevance": "Established the Chinchilla scaling law advocating proportional scaling of parameters and data, the methodological basis for this paper's IsoFLOP approach." 372 }, 373 { 374 "title": "LoRA: Low-rank adaptation of large language models", 375 "authors": ["Edward J Hu", "Yelong Shen", "Phillip Wallis", "Zeyuan Allen-Zhu"], 376 "year": 2022, 377 "relevance": "Parameter-efficient fine-tuning via low-rank matrix approximation, used as a building block in the Familial Models architecture." 378 }, 379 { 380 "title": "vLLM: Easy, fast, and cheap LLM serving with PagedAttention", 381 "authors": ["Woosuk Kwon", "Zhuohan Li", "Siyuan Zhuang"], 382 "year": 2023, 383 "relevance": "Efficient LLM serving infrastructure addressing the deployment cost constraints that motivate Familial Models." 
384 }, 385 { 386 "title": "Reconciling Kaplan and Chinchilla scaling laws", 387 "authors": ["Tim Pearce", "Jinyeop Song"], 388 "year": 2024, 389 "arxiv_id": "2406.12907", 390 "relevance": "Recent replication and correction of foundational scaling law studies, reaffirming compute-optimal frontier validity." 391 }, 392 { 393 "title": "Resolving discrepancies in compute-optimal scaling of language models", 394 "authors": ["Tomer Porian", "Mitchell Wortsman", "Jenia Jitsev", "Ludwig Schmidt", "Yair Carmon"], 395 "year": 2024, 396 "relevance": "Explained the discrepancy between the Kaplan and Chinchilla compute-optimal scaling laws by tracing it to last-layer compute cost, warmup duration, and scale-dependent optimizer tuning in the Kaplan setup; with these factors corrected, the results agree with the Chinchilla scaling law." 397 }, 398 { 399 "title": "Towards greater leverage: Scaling laws for efficient mixture-of-experts language models", 400 "authors": ["Changxin Tian", "Kunlong Chen", "Jia Liu"], 401 "year": 2025, 402 "arxiv_id": "2507.17702", 403 "relevance": "Introduced the Efficiency Leverage metric for MoE models that this paper adapts for Familial Models; directly inspired the architectural deconstruction methodology." 404 }, 405 { 406 "title": "Scaling laws for fine-grained mixture of experts", 407 "authors": ["Jakub Krajewski", "Jan Ludziejewski", "Kamil Adamczewski"], 408 "year": 2024, 409 "arxiv_id": "2402.07871", 410 "relevance": "Scaling law analysis for MoE architectures showing how expert granularity affects efficiency, closely related to this paper's granularity analysis." 411 }, 412 { 413 "title": "FrugalGPT: How to use large language models while reducing cost and improving performance", 414 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 415 "year": 2023, 416 "arxiv_id": "2305.05176", 417 "relevance": "Cost-efficient LLM usage strategies relevant to the practical motivation of deploying multiple model sizes." 
418 }, 419 { 420 "title": "Any-precision LLM: Low-cost deployment of multiple, different-sized LLMs", 421 "authors": ["Yeonhong Park", "Jake Hyun", "SangLyul Cho", "Bonggeun Sim", "Jae W Lee"], 422 "year": 2024, 423 "arxiv_id": "2402.10517", 424 "relevance": "Alternative approach to deploying multiple LLM sizes from single training, directly comparable to the Familial Models paradigm." 425 }, 426 { 427 "title": "Matryoshka representation learning", 428 "authors": ["Aditya Kusupati", "Gantavya Bhatt", "Aniket Rege"], 429 "year": 2022, 430 "relevance": "Flexible representation learning with nested sub-models sharing aligned features, conceptually related to the Familial Models multi-exit approach." 431 } 432 ] 433 }