scan.json (26722B)
1 { 2 "paper": { 3 "title": "Using Scaling Laws for Data Source Utility Estimation in Domain-Specific Pre-Training", 4 "authors": [ 5 "Oleksiy Ostapenko", 6 "Charles Guille-Escuret", 7 "Luke Kumar", 8 "Max Tian", 9 "Denis Kocetkov", 10 "Gopeshh Subbaraj", 11 "Raymond Li", 12 "Joel Lamy-Poirier", 13 "Sebastien Paquet", 14 "Torsten Scholak" 15 ], 16 "year": 2025, 17 "venue": "arXiv", 18 "arxiv_id": "2507.22250", 19 "doi": "10.48550/arXiv.2507.22250" 20 }, 21 "scan_version": 2, 22 "active_modules": ["experimental_rigor", "data_leakage"], 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": false, 28 "justification": "The paper references Fast-LLM (github.com/ServiceNow/Fast-LLM) as the training framework, but does not release experiment code, configuration files, or scripts for reproducing the scaling law analysis." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": false, 33 "justification": "The base datasets (FineWeb-Edu, Dolma) are public, but the paper's curated domain-specific datasets (MBF-filtered data, WRAP-generated data, instruction-augmented data) are not released." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "The paper mentions 64 H100 GPUs, FlashAttention 2, ZeRO stage 3, and Fast-LLM with a specific git SHA (ff1486d), but provides no requirements.txt, Dockerfile, or library version listing sufficient to recreate the environment." 39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "No step-by-step reproduction instructions are provided. The paper describes the methodology but does not include runnable scripts or a README with commands." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "No confidence intervals or error bars are reported. 
Figures show point estimates connected by scaling curves without uncertainty bands." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "No statistical significance tests are performed. Claims about one method outperforming another are based on visual comparison of scaling curves without any formal testing." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "The paper reports Brier Score Δ and Exact Match Δ relative to the full replay baseline (Equation 2), providing both the magnitude and direction of improvements in context. Figures 5 and 6 show these deltas across compute scales." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "No justification is given for why 6 annealing durations (1k-36k steps) were chosen, nor why 2 seeds for baseline and 1 seed for other methods is sufficient." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "The paper states 'We run two seeds and average the results for the full replay baseline, and only use a single seed for other baselines to minimize compute cost' (Section 3.3). No standard deviations or variance measures are reported." 71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "Full replay (continued training on pretraining data without domain-specific upsampling) serves as the baseline. Multiple data sourcing methods are compared against it." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "Baselines include WRAP (Maini et al., 2024), TinyGSM-MIND (OLMo et al., 2024), instruction augmentation (Cheng et al., 2024), and MBF following recent practices. All are contemporary methods." 
83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "Figure 3 ablates WRAP variants: WRAP with/without MMLU-style Q/A and WRAP+Q/A on unrelated Wikipedia documents, isolating the effect of formatting from domain knowledge. MBF threshold ablation is also reported (Appendix D.1)." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "The paper uses Brier Score (primary) and Exact Match across multiple evaluation formats: continuation (CF), multiple choice (MC), and generative, as detailed in Table 1." 93 }, 94 "human_evaluation": { 95 "applies": false, 96 "answer": false, 97 "justification": "Human evaluation is irrelevant to this paper's claims about data source scaling behavior for model training." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": true, 102 "justification": "Evaluation is conducted on standard MMLU benchmark tasks, GSM8k, and Hendrycks MATH — established test sets separate from training data." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Results are broken down by domain (medical vs math), evaluation format (CF, MC, generative), and individual data sourcing method. Table 1 lists all tasks by category." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper discusses WRAP's diminishing returns at scale due to low diversity (Section 4.1), and instruction augmentation's failure to outperform full replay on CF tasks (Section 4.1)." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "Instruction augmentation does not outperform full replay on CF tasks (Section 4.1). TinyGSM shows poor scaling (R²=0.1) on math domain. WRAP's utility degrades at scale." 
118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "The abstract claims that rankings are not invariant across scales (demonstrated in Fig. 1), scaling curves can be constructed per source (Figs. 5-6), and validates on 7B model with medical and math domains. All are supported by experimental results." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "The main causal claim is from ablation studies: Fig. 3 shows controlled removal of MMLU-style Q/A from WRAP, demonstrating the formatting effect. The diversity hypothesis for WRAP's poor scaling is appropriately framed as a hypothesis ('We hypothesize,' Section 4.1)." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": true, 134 "justification": "The abstract and Section 3 explicitly state validation is on 'a pre-trained model with 7 billion parameters' with two specific domains. Section 4.3 acknowledges inability to verify generalization to larger scales." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": true, 139 "justification": "The paper discusses diversity as an alternative explanation for scaling differences (Section 4.1, Figs. 4 and 15), and format sensitivity as an alternative explanation for apparent method effectiveness (Section 3.2, Fig. 3)." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": true, 144 "justification": "The paper explicitly defines the utility function U(D) (Equation 2) and discusses why Brier Score is preferred over accuracy as a metric, citing Schaeffer et al. (2023) on how metric choice affects emergent behavior detection." 
145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": true, 151 "justification": "The paper specifies Meta-Llama-3-70B-Instruct for annotation, Meta-Llama-3.2-3B-Instruct for WRAP, Qwen2.5-7B-Instruct for MIND rephrasing. Their own base model architecture is described as Mistral-7b-based with full training details in Appendix C.1." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": true, 156 "justification": "Full prompt text is provided in Figures 7-12: 5-point scoring prompts for math and medical MBF (Figs. 7-8), and WRAP style prompts for scholar, Q/A, MMLU-style Q/A, and Wikipedia formats (Figs. 9-12)." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Detailed hyperparameters in Section 3.3 and Appendix C.1: learning rate 3e-4, AdamW with β1=0.9, β2=0.95, batch size 256, sequence length 8192, 10% upsampling ratio, annealing steps 1k-36k, MBF threshold 2.5." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "No agentic scaffolding is used. The paper trains and evaluates language models directly." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "MBF filtering pipeline is described: 500k examples annotated by Llama-3-70B, BERT regressor trained, threshold of 2.5 selected via ablation (Appendix D.1). WRAP generation process is detailed in Appendix D.2 with seed selection criteria." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section 4.3 'Limitations' provides substantive discussion of the approach's constraints." 
179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": true, 183 "justification": "Section 4.3 discusses specific threats: batch sampling stochasticity at small compute, WRAP's variability distorting scaling coefficients, inability to ablate upsampling ratio and checkpoint sensitivity, and unknown generalization to larger compute scales." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": true, 188 "justification": "Section 4.3 explicitly states what was not tested: sensitivity to upsampling ratio, impact of starting checkpoint/learning rate, generalization to scales 'orders of magnitude larger,' and ablations across tunings of a single data source." 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": false, 195 "justification": "No raw experimental data (per-run metrics, individual evaluation scores) is released. Only aggregated results in figures are available." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Data collection is described in detail: MBF uses BERT regressor trained on 500k Llama-3-70B annotations (Appendix D.1), WRAP uses 1M highly-scored MBF documents as seeds (Appendix D.2), TinyGSM generation process is cited from Liu et al. (2023)." 201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants. Data sources are standard public datasets (FineWeb-Edu, Dolma) and synthetic generation." 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "The pipeline from raw data to results is documented: pretraining mix → checkpoint selection → MBF/WRAP/synthetic data generation → annealing with 10% upsampling → evaluation on benchmarks. Appendices D and E provide additional detail." 
211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding sources or acknowledgments section is present in the paper." 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Author affiliations are clearly listed: ServiceNow Research, Mila — Quebec AI Institute, and Reka AI. Two authors are noted as having done work during internship at ServiceNow." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": false, 227 "justification": "ServiceNow Research is the primary affiliation and likely funded the compute. As the employer of most authors, ServiceNow has a potential interest in the framework's success. No explicit funding independence statement is provided." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests statement or financial disclosures are present in the paper." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": true, 238 "answer": false, 239 "justification": "The paper does not state a training data cutoff date. They describe training on FineWeb-Edu and Dolma but do not specify the temporal coverage of these datasets." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": true, 243 "answer": false, 244 "justification": "No discussion of whether MMLU, GSM8k, or Hendrycks MATH test examples appear in FineWeb-Edu or Dolma training data." 245 }, 246 "benchmark_contamination_addressed": { 247 "applies": true, 248 "answer": false, 249 "justification": "MMLU (2020), GSM8k (2021), and Hendrycks MATH are well-established benchmarks likely present in web-crawled training data. The paper does not address this contamination risk." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 
257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in this study." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": true, 293 "justification": "The paper provides detailed FLOPs cost analysis for each data sourcing method (Appendix E), including back-of-envelope estimates ($500K-$1M for 100B synthetic tokens). Figure 5 plots performance vs. compute cost." 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": true, 298 "justification": "Base model training: 32,500 H100-hours on 64 H100 GPUs (Appendix C.1). Annealing runs: 2.1B to 75B tokens. Curation costs estimated in FLOPs (Appendix E)." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "Only 2 seeds for the full replay baseline, and a single seed for all other methods (Section 3.3). No seed sensitivity analysis is performed." 
306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": true, 310 "justification": "Section 3.3 explicitly states: 'We run two seeds and average the results for the full replay baseline, and only use a single seed for other baselines.'" 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "The 10% upsampling ratio was 'selected based on a hyperparameter search conducted on MBF data in the medical domain' but the number of configurations tried and compute spent is not reported. MBF threshold sweep (2 to 5 in 0.5 increments) is mentioned but total search cost is not." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": true, 320 "justification": "MBF threshold of 2.5 was selected from a parameter sweep (2 to 5 in 0.5 increments, Appendix D.1) based on downstream performance. The 10% upsampling ratio was selected via search on medical domain MBF data." 321 }, 322 "multiple_comparison_correction": { 323 "applies": false, 324 "answer": false, 325 "justification": "No statistical hypothesis tests are performed, so multiple comparison correction is not applicable." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "The authors implement and evaluate all data sourcing methods themselves. No discussion of potential bias from self-evaluation or comparison with independent implementations." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": true, 335 "justification": "This is the core contribution. Figures 5 and 6 plot performance (Brier Score Δ, Exact Match Δ) as a function of compute budget (FLOPs) for each data sourcing method." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": true, 340 "justification": "Section 3.2 and Fig. 
3 explicitly discuss construct validity: CF format is more robust than MC format for measuring knowledge (MC is sensitive to formatting artifacts). The paper advocates Brier Score over accuracy citing Schaeffer et al.'s analysis of emergent abilities as metric artifacts." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "No scaffolding is involved. The paper evaluates data sources through direct model training and standard benchmark evaluation." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "MMLU (2020), GSM8k (2021), and Hendrycks MATH were published before the model's training data was likely collected. No discussion of temporal leakage." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of whether the evaluation setup (particularly WRAP's MMLU-style Q/A augmentation) leaks information about the test format or content." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "No discussion of whether training data (FineWeb-Edu, Dolma) and benchmark test data share structural similarities or content overlap." 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": false, 367 "justification": "No leakage detection or prevention method is applied. No decontamination, canary strings, or membership inference tests." 
368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "Data source rankings are not invariant across compute scales — WRAP outperforms MBF at small compute but this reverses at larger scales in the medical domain.", 374 "evidence": "Figure 1 and Figure 5 (top) show WRAP outperforming MBF at low token counts but MBF overtaking at higher compute budgets on MMLU Medical CF tasks.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Scaling curves can be estimated per data source to predict utility at larger scales, enabling cost-effective resource allocation.", 379 "evidence": "Figure 5 shows fitted scaling laws with R² values: MBF R²=0.8, WRAP R²=0.9 (medical curation-only). The extrapolated curves correctly predict relative rankings at the highest tested scale.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "WRAP's poor scaling is due to low diversity compared to MBF.", 384 "evidence": "Figure 4 shows MBF has higher distinct n-gram scores across all n-gram levels; Figure 15 shows higher entropy for MBF. The authors frame this as a hypothesis (Section 4.1).", 385 "supported": "weak" 386 }, 387 { 388 "claim": "Instruction augmentation improves MC-format evaluation but not CF-format, suggesting it enhances formatting rather than underlying knowledge.", 389 "evidence": "Figure 5 shows instruction augmentation does not outperform full replay on CF tasks. Figure 13b shows it matches WRAP on MC tasks. 
Figure 3 demonstrates format sensitivity.", 390 "supported": "strong" 391 }, 392 { 393 "claim": "CF evaluation format is more robust and format-invariant than MC or generative formats.", 394 "evidence": "Figure 3 shows CF performance remains consistent across WRAP variants (with/without MMLU Q/A, Wiki Q/A), while MC and generative formats vary dramatically with formatting changes.", 395 "supported": "strong" 396 } 397 ], 398 "methodology_tags": ["benchmark-eval"], 399 "key_findings": "Data source rankings for domain-specific model annealing are not invariant across compute scales, making single-point estimates (micro-annealing) potentially misleading. The paper demonstrates this with a 7B parameter model across medical and math domains, showing that WRAP synthetic data outperforms model-based filtering at small compute but the relationship reverses at scale, likely due to diversity limitations. The CF evaluation format is found to be more robust than MC format for assessing domain knowledge, as MC is highly sensitive to formatting artifacts. Cost-aware scaling curves can be constructed per data source to inform resource allocation decisions.", 400 "red_flags": [ 401 { 402 "flag": "Single-seed experiments", 403 "detail": "All non-baseline experiments use a single random seed. The authors acknowledge stochasticity at small scales can distort scaling law coefficients, yet they do not mitigate this with multiple seeds. This undermines the reliability of the fitted scaling curves, particularly for methods like WRAP that 'show greater variability at low scales.'" 404 }, 405 { 406 "flag": "No contamination analysis", 407 "detail": "The paper evaluates on MMLU, GSM8k, and Hendrycks MATH — all well-established benchmarks likely present in web-crawled training data (FineWeb-Edu, Dolma). No decontamination or overlap analysis is performed. 
Benchmark contamination could differentially affect data sourcing methods (e.g., MBF may preferentially select documents containing benchmark-adjacent content)." 408 }, 409 { 410 "flag": "Potential MMLU leakage through WRAP", 411 "detail": "WRAP+MMLU-style Q/A explicitly generates training examples in MMLU format. This could leak test format or content into training, inflating MC evaluation scores. While the paper acknowledges this for MC, the potential contamination of CF evaluation is not discussed." 412 }, 413 { 414 "flag": "Company evaluating own framework", 415 "detail": "ServiceNow Research employees evaluate a framework that enhances their training pipeline. The Fast-LLM training engine (also from ServiceNow) is used throughout. No independent evaluation or conflict acknowledgment." 416 } 417 ], 418 "cited_papers": [ 419 { 420 "title": "Scaling laws for neural language models", 421 "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"], 422 "year": 2020, 423 "arxiv_id": "2001.08361", 424 "relevance": "Foundational scaling laws work that this paper extends to data source utility estimation." 425 }, 426 { 427 "title": "The Llama 3 herd of models", 428 "authors": ["Aaron Grattafiori"], 429 "year": 2024, 430 "arxiv_id": "2407.21783", 431 "relevance": "Introduces micro-annealing for data evaluation, the baseline approach this paper extends with scaling laws." 432 }, 433 { 434 "title": "Rephrasing the web: A recipe for compute and data-efficient language modeling", 435 "authors": ["Pratyush Maini", "Skyler Seto"], 436 "year": 2024, 437 "arxiv_id": "2401.16380", 438 "relevance": "Proposes WRAP method for synthetic data augmentation, one of the main data sourcing methods evaluated." 
439 }, 440 { 441 "title": "2 OLMo 2 Furious", 442 "authors": ["Team OLMo", "Pete Walsh", "Luca Soldaini"], 443 "year": 2024, 444 "arxiv_id": "2501.00656", 445 "relevance": "Uses micro-annealing for data quality validation and introduces TinyGSM-MIND variant evaluated in this paper." 446 }, 447 { 448 "title": "Scaling laws for data filtering — data curation cannot be compute agnostic", 449 "authors": ["Sachin Goyal", "Pratyush Maini"], 450 "year": 2024, 451 "relevance": "Demonstrates that high-quality filtered data loses utility when repeated, directly motivating scaling-aware data source evaluation." 452 }, 453 { 454 "title": "RegMix: Data mixture as regression for language model pre-training", 455 "authors": ["Qian Liu", "Xiaosen Zheng"], 456 "year": 2024, 457 "arxiv_id": "2407.01492", 458 "relevance": "Alternative approach to data mixture optimization using regression on small proxy models, compared conceptually to this work." 459 }, 460 { 461 "title": "Data mixing laws: Optimizing data mixtures by predicting language modeling performance", 462 "authors": ["Jiasheng Ye"], 463 "year": 2024, 464 "arxiv_id": "2403.16952", 465 "relevance": "Proposes functional relationships for predicting performance across data mixtures, complementary approach to per-source scaling laws." 466 }, 467 { 468 "title": "Are emergent abilities of large language models a mirage?", 469 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 470 "year": 2023, 471 "relevance": "Motivates the use of Brier Score over accuracy by showing emergent abilities can be metric artifacts." 472 }, 473 { 474 "title": "TinyGSM: achieving >80% on GSM8k with small language models", 475 "authors": ["Bingbin Liu", "Sebastien Bubeck"], 476 "year": 2023, 477 "arxiv_id": "2312.09241", 478 "relevance": "Proposes synthetic math data generation method evaluated as a data source in this paper." 
479 }, 480 { 481 "title": "DeepSeekMath: Pushing the limits of mathematical reasoning in open language models", 482 "authors": ["Zhihong Shao", "Peiyi Wang"], 483 "year": 2024, 484 "arxiv_id": "2402.03300", 485 "relevance": "Demonstrates importance of large-scale synthetic data for math reasoning, motivating the need for scaling-aware data source evaluation." 486 } 487 ] 488 }