scan.json (23295B)
1 { 2 "paper": { 3 "title": "Relative-Based Scaling Law for Neural Language Models", 4 "authors": ["Baoqing Yue", "Jinyuan Zhou", "Zixi Wei", "Jingtao Zhan", "Qingyao Ai", "Yiqun Liu"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2510.20387", 8 "doi": "10.48550/arXiv.2510.20387" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval", "theoretical"], 13 "key_findings": "The paper proposes Relative-Based Probability (RBP), a metric measuring whether the correct token ranks in the model's top-k predictions, and establishes a power-law scaling relationship between -log(RBP_k) and model size for small k. Experiments across 4 model families (GPT-2, OPT, Pythia, Qwen2.5) and 4+ datasets show R² ≥ 0.97 for k ≤ 100. The law breaks down when k approaches vocabulary size. The authors use the law to explain emergence phenomena as a macroscopic consequence of smooth token-level scaling and conjecture a lognormal rank distribution unifying cross-entropy and RBP scaling laws.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "Footnote 1 states 'Code is available at https://github.com/ybq22/relative-based-scaling-law.'" 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper uses publicly available datasets: Wikipedia, HumanEval, HotpotQA, Open Australian Legal Corpus, C4, and Pile. All are referenced with public URLs or citations." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No environment specifications, requirements.txt, or dependency details are provided in the paper." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but the paper itself contains no reproduction guide." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Results are reported as point estimates (R² values, slopes) with no confidence intervals or error bars on the fitted parameters." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "No statistical significance tests are used. Claims of power-law fit rely solely on R² values without testing whether the fit is significantly better than alternatives." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Scaling exponents (slopes) are reported numerically across all conditions in Table 1, providing magnitude context for the scaling relationships (e.g., slope = -0.087 for cross-entropy, -0.079 for RBP1 on Wiki)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification for why 24 models across 4 families is sufficient to establish the scaling law. The paper states 'at least 5×10⁵ tokens' per data point but does not justify this threshold." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance or standard deviation reported across runs or token samples. Single-point estimates throughout." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Cross-entropy scaling law serves as the baseline comparison throughout. The paper directly compares R² and slopes between CE-based and RBP-based scaling laws (Figure 8a, Table 1)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Qwen2.5 (2024) is included as a contemporary model family, and the paper references Chinchilla (Hoffmann et al., 2022) and other recent scaling law work." 
74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper systematically varies k across three regimes (k=1, moderate k, large k) to show which conditions support the scaling law, functioning as an ablation of the threshold parameter." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "The paper uses both cross-entropy loss and RBP_k across multiple k values as evaluation metrics, plus R² for fit quality." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "Human evaluation is not relevant to establishing mathematical scaling laws for token prediction metrics." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "The paper evaluates on separate datasets (Wikipedia, HumanEval, HotpotQA, AusLegal) that are not used for model training parameter selection. The models are pre-trained and used as-is." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table 1 provides per-dataset, per-model-family, and per-k-value breakdowns of R² and slope values, totaling hundreds of individual results." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 4.3 explicitly discusses where the scaling law breaks down (large k regime), with Figure 5 showing the deterioration. GPT-2 anomalous behavior is noted." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The large-k regime (Section 4.3) is an explicitly reported negative result where the power-law relationship fails. Low R² for GPT-2 on Github (0.091 for CE) is also reported in Table 1." 
109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims about power-law scaling of RBP_k, robustness across datasets/model families, and applications to emergence explanation are all supported by the experimental sections." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper claims 'scaling up models fundamentally reshapes the ranking of tokens' (Section 4.1) — a causal claim from observational data. Models differ in architecture, training data, and training procedure, not just size. No controlled experiment isolates model size as the causal factor." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title claims 'Neural Language Models' broadly but experiments are limited to autoregressive decoder-only transformers. No encoder-only, encoder-decoder, or non-transformer architectures are tested. The abstract says 'scaling large language models' without bounding to the tested architectures." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No discussion of alternative explanations for the observed power-law relationships. Could the relationship be an artifact of how these particular model families were trained? No consideration of confounds like training data composition varying with model size." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper clearly distinguishes RBP as measuring top-k ranking probability vs. cross-entropy measuring absolute probability, and explicitly discusses when each is relevant (e.g., RBP for greedy decoding, Section 1 and 3.2)." 
136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Specific model series and sizes are named: Pythia 14M–12B (Biderman et al., 2023), GPT-2 (Radford et al., 2019), OPT (Zhang et al., 2022), Qwen2.5 0.5B–14B (Yang et al., 2024). These are open-weight models with specific published versions." 143 }, 144 "prompts_provided": { 145 "applies": false, 146 "answer": false, 147 "justification": "The paper does not use prompting. It evaluates next-token prediction on corpora, not prompted generation." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No hyperparameters are reported for the evaluation pipeline (e.g., context length, batch size, tokenization settings). The paper states 'at least 5×10⁵ tokens' but provides no further details." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The paper evaluates raw model predictions on corpora." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": false, 162 "justification": "No description of how the evaluation corpora were preprocessed. How were Wikipedia, HumanEval, HotpotQA, and AusLegal tokenized and split? What portions were used? Not documented." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "No dedicated limitations section. The paper acknowledges the large-k breakdown in Section 4.3 but does not have a structured limitations discussion." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity discussed. The paper does not address whether the observed patterns could be artifacts of the specific model families or datasets chosen." 
175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "No explicit scope boundaries stated. The paper does not clarify that results are limited to autoregressive decoder-only transformers, or that the lognormal conjecture is unproven." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "Raw per-token prediction data is not released. Only aggregate RBP values and fitted curves are shown." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section A describes model selection (4 families spanning 5 orders of magnitude) and dataset selection (4 benchmarks covering different domains). The RBP computation procedure is clearly defined in Section 3.2." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data sources are standard public benchmarks and pre-trained models." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "The pipeline from corpus to RBP computation is conceptually described but lacks implementation details: how tokens were sampled, what context lengths were used, how the 5×10⁵ token minimum was enforced." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information or acknowledgments section is present in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All authors are listed as affiliated with Tsinghua University. No products being evaluated are affiliated with the authors." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence cannot be assessed." 
219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff dates are stated for any of the model families used. The paper evaluates pre-trained models on benchmarks like HumanEval without discussing temporal overlap." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether evaluation corpora (Wikipedia, HumanEval, etc.) overlap with the training data of GPT-2, OPT, Pythia, or Qwen models." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "HumanEval was published in 2021; Qwen2.5 was trained well after. Wikipedia is likely in all models' training data. No contamination discussion whatsoever." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 
273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference costs or compute time reported despite running 24 models across 6 datasets with millions of tokens each." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No GPU hours, hardware specifications, or total compute budget stated." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No seed sensitivity analysis. The paper uses pre-trained models with deterministic evaluation (next-token prediction), but does not discuss whether different corpus subsamples yield different results." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "No statement of how many runs produced each data point. The paper states 'at least 5×10⁵ tokens' but not whether this was computed once or averaged over multiple samples." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": false, 305 "answer": false, 306 "justification": "No hyperparameter search is involved. The method computes RBP directly from model outputs with k as a parameter, not a tuned hyperparameter." 307 }, 308 "best_config_selection_justified": { 309 "applies": false, 310 "answer": false, 311 "justification": "No configuration selection is involved. The paper reports results across all k values rather than selecting a best one." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical hypothesis tests are performed, so multiple comparison correction is not applicable." 
317 }, 318 "self_comparison_bias_addressed": { 319 "applies": false, 320 "answer": false, 321 "justification": "The paper proposes a new metric and law, not a system competing against baselines. There are no re-implementations of competing methods." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": false, 325 "answer": false, 326 "justification": "The paper studies how model performance scales with model size, not compute budget. Compute is not a variable in the experiments." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether the evaluation corpora (Wikipedia, HumanEval, HotpotQA, AusLegal) are appropriate for establishing general scaling laws. Wikipedia may be memorized; HumanEval is a small benchmark." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved. Models are evaluated on raw next-token prediction." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of temporal leakage. HumanEval (2021), HotpotQA (2018), and Wikipedia are all available before training of later models like Qwen2.5." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether evaluation corpora are in the models' training data, which would constitute direct feature leakage for a next-token prediction task." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of overlap between training and evaluation data. Wikipedia is almost certainly in the training data of all models tested." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention methods used despite high contamination risk." 
359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "-log(RBP_k) follows a power-law relationship with model size S for small k (k << vocabulary size)", 365 "evidence": "Figures 3-4 and Table 1 show R² ≥ 0.97 across 4 datasets and 4 model families for k ≤ 100. 24 models spanning 5 orders of magnitude.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "The power-law relationship breaks down when k approaches vocabulary size", 370 "evidence": "Figure 5 shows scattered results for k = 20,000 and 30,000. Section 4.3 discusses the deterioration with GPT-2 showing non-monotonic behavior.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Cross-entropy and -log(RBP_1) have nearly identical scaling exponents", 375 "evidence": "Figure 8a shows slopes of -0.087 (CE) vs -0.079 (RBP_1) on Wiki data. Table 1 shows similar slopes across datasets.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Emergence phenomena can be explained as a macroscopic consequence of smooth token-level scaling under top-k sampling", 380 "evidence": "Section 5.1 derives the sigmoidal emergence curve from the power-law (Equation 8). Figure 7 shows empirical confirmation across multiple k values.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "A lognormal rank distribution can unify cross-entropy and RBP scaling laws", 385 "evidence": "Section 5.2 and Appendix C present the theoretical derivation. Figure 8c shows lognormal fits to empirical rank distributions for Pythia models.", 386 "supported": "weak" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "No contamination analysis", 392 "detail": "The paper evaluates models on Wikipedia (almost certainly in training data) and HumanEval (published 2021, before Qwen2.5 training) without any contamination discussion. If models have memorized evaluation corpora, the RBP measurements may reflect memorization rather than genuine scaling of predictive ability." 
393 }, 394 { 395 "flag": "Confounded model comparisons", 396 "detail": "Models within each family differ not only in size but potentially in training data composition, training duration, and hyperparameters. The paper attributes scaling behavior to model size without controlling for these confounds." 397 }, 398 { 399 "flag": "No uncertainty quantification", 400 "detail": "All results are single point estimates with no error bars, confidence intervals, or variance across subsamples. For a paper establishing a 'law', the precision of the fitted parameters is unknown." 401 }, 402 { 403 "flag": "Overclaiming scope", 404 "detail": "Title says 'Neural Language Models' but only autoregressive decoder-only transformers are tested. No encoder models, encoder-decoders, SSMs, or other architectures." 405 } 406 ], 407 "cited_papers": [ 408 { 409 "title": "Scaling laws for neural language models", 410 "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan", "Tom B Brown"], 411 "year": 2020, 412 "arxiv_id": "2001.08361", 413 "relevance": "Foundational scaling law paper establishing cross-entropy power-law relationships." 414 }, 415 { 416 "title": "Training compute-optimal large language models", 417 "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"], 418 "year": 2022, 419 "arxiv_id": "2203.15556", 420 "relevance": "Chinchilla scaling laws for compute-optimal training of LLMs." 421 }, 422 { 423 "title": "Emergent abilities of large language models", 424 "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"], 425 "year": 2022, 426 "arxiv_id": "2206.07682", 427 "relevance": "Defined emergence phenomena in LLMs; this paper proposes an alternative explanation." 428 }, 429 { 430 "title": "Are emergent abilities of large language models a mirage?", 431 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 432 "year": 2023, 433 "relevance": "Argues emergence is a measurement artifact; this paper extends the argument with relative-based metrics." 
434 }, 435 { 436 "title": "Pythia: A suite for analyzing large language models across training and scaling", 437 "authors": ["Stella Biderman", "Hailey Schoelkopf"], 438 "year": 2023, 439 "relevance": "Model suite used extensively in this paper's experiments for scaling analysis." 440 }, 441 { 442 "title": "Evaluating large language models trained on code", 443 "authors": ["Mark Chen", "Jerry Tworek"], 444 "year": 2021, 445 "relevance": "HumanEval benchmark used as one of four evaluation datasets." 446 }, 447 { 448 "title": "Scaling laws for autoregressive generative modeling", 449 "authors": ["Tom Henighan", "Jared Kaplan"], 450 "year": 2020, 451 "arxiv_id": "2010.14701", 452 "relevance": "Extended scaling laws beyond language to autoregressive generation broadly." 453 }, 454 { 455 "title": "Scaling laws for dense retrieval", 456 "authors": ["Yan Fang", "Jingtao Zhan", "Qingyao Ai"], 457 "year": 2024, 458 "relevance": "Applied scaling laws to information retrieval; shares authors (Jingtao Zhan, Qingyao Ai) with this paper." 459 } 460 ] 461 }