scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27270B)
      1 {
      2   "paper": {
      3     "title": "Relative Scaling Laws for LLMs",
      4     "authors": ["William Held", "David Hall", "Percy Liang", "Diyi Yang"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2510.24626",
      8     "doi": "10.48550/arXiv.2510.24626"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "Footnote 1 states experimental code is in the Marin Github, analysis/plotting code in a separate project repository, and all 255 model checkpoints are released on HuggingFace."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper uses publicly available datasets (CommonPile, DCLM Baseline, Nemotron-CC) and releases all 255 model checkpoints on HuggingFace. Evaluation benchmarks (MMLU, ICE, Anthropic risk evals) are public."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided. The paper references external code repositories but does not specify dependency versions."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "While code repositories and model checkpoints are referenced, the paper itself does not provide step-by-step reproduction instructions. The reader must navigate external repositories."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The scaling law plots show fitted curves but no confidence intervals or error bars on the individual data points or fitted parameters."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Section 2 states: 'we only interpret the slope if the sign is significant at P < 0.05 by a bootstrap significance test.' Pearson correlations in Section 4.2 include p-values (P=0.002, P=0.001)."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Effect sizes are reported throughout: percentage relative error reductions (e.g., '−29% Humanities at 10^18 FLOPs narrowing to within 5% at 10^20'), Pearson R correlations (0.82–0.84), and slopes like '0.3–0.4% relative error slope improvement per ten-fold increase in speaker population.'"
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No justification for why 85 models per dataset (255 total) or the specific FLOP range was chosen. No power analysis for detecting the relative scaling effects."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No variance or standard deviation across runs is reported. Each model appears to be a single training run. No discussion of run-to-run variance."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Traditional (absolute) scaling laws serve as the baseline framework. The paper compares relative scaling against absolute scaling throughout, and validates against held-out models (OLMo 2, Llama 3, Qwen 3) in Figure 2."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Held-out comparison models include OLMo 2, Llama 3, and Qwen 3 — all contemporary models. Prior work compared includes recent papers (Hoffmann 2022, Besiroglu 2024, Porian 2025)."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Section 4.2 and Figure 6 isolate model-size scaling vs. token scaling, showing that model-size scaling drives relative performance shifts while data scaling leaves relative performance unchanged. The paper also compares across 3 training datasets as an ablation over data distribution."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper uses loss (BPB), accuracy, R² fit quality, Pearson correlation, and risk likelihood as metrics across different case studies."
     82       },
     83       "human_evaluation": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "Human evaluation is not relevant to this scaling law analysis — the claims are about statistical relationships between compute and model performance metrics."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Figure 2 explicitly validates scaling law predictions against held-out models (OLMo 2, Llama 3, Qwen 3) not used for fitting. Evaluation benchmarks (MMLU, ICE, Anthropic risk evals) are separate from training data."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down by MMLU sub-domain (STEM, Humanities, Social Sciences, Misc.), by regional English variety (USA, Canada, Nigeria, Singapore, Sri Lanka + 5 others), and by AI risk cluster (5 categories). This is the core of the paper."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper explicitly discusses cases where scaling fails: regional English varieties that diverge with scale (Sri Lanka, Nigeria), and AI risk categories where scaling has no effect (Scheming, Incorrigibility)."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Key negative results: scaling does NOT close gaps for all English varieties (divergence for Sri Lanka and Nigeria), scaling does NOT reduce all risk categories (Scheming and Incorrigibility are flat or declining). The Limitations section acknowledges theoretical grounding is missing."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims about MMLU convergence, dialect divergence by population size, and AI risk splitting are all supported by Figures 4, 5, and 7 respectively with data across three training corpora."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper uses controlled IsoFLOP training to isolate the effect of compute. Figure 6 further isolates model-size vs. data scaling as separate causes. The causal identification strategy (matched compute budgets, three independent corpora) is appropriate for the claims made."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The Limitations section explicitly states: 'The three case studies we present are necessarily selective, so they should be viewed as a first attempt rather than a full coverage of the application space.' The paper also notes the connection between loss and utility is 'not well studied.'"
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Section 4.2 discusses that intercepts show no correlation with prevalence (only slopes do), considers the role of training data distribution, and Section 4.3 discusses whether adversarial risks require 'additional pressures' beyond pretraining. The Limitations section discusses the gap between loss and utility."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper explicitly discusses the proxy gap: Section 3.2 establishes that loss is used as a proxy for accuracy, and the Limitations section states 'the connection between raw language modeling loss, as studied in the case study on linguistic variation, and broad utility for downstream users is not well studied.'"
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper specifies Qwen 3 architecture for training, with exact model sizes ranging by width (512–4096). Held-out models are named (OLMo 2, Llama 3, Qwen 3). All 255 models are released on HuggingFace."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Appendix D (Figure A.4) provides the exact prompt formats used for MMLU evaluation, including Default MCQ, Continuation-Form, and Modified MCQ with the actual template and scored completions."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Appendix A provides comprehensive hyperparameter details: learning rate formula, β1=0.95, β2=0.95, ε=10^-15, weight decay=0.1, gradient clipping at norm 1.0, warmup 5%, linear decay 20%, batch size formula, and attention head configuration."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used. The paper trains and evaluates Transformer models directly."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 3.1 describes the three training datasets and their design philosophies. For ICE (Section 4.2), the paper specifies restriction to the written component and describes the corpus structure (~1M words per variety, 500 texts of ~2,000 words). Appendix E documents the clustering of AI risk behaviors."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 6 (Conclusion) contains a dedicated 'Limitations' paragraph discussing theoretical grounding gaps, selective case studies, and the loss-utility connection."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The Limitations section raises specific threats: 'Our analyses are primarily empirical and do not yet provide the kind of theoretical grounding suggested by prior work,' 'three case studies we present are necessarily selective,' and 'the connection between raw language modeling loss...and broad utility for downstream users is not well studied.'"
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The paper explicitly bounds scope: case studies 'should be viewed as a first attempt rather than a full coverage,' identifies what was NOT tested (targeted data augmentation, multimodal models, post-training effects), and notes the loss-utility gap."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "All 255 model checkpoints are released on HuggingFace. Experimental logs are viewable on the Marin data browser (footnote 1). Training datasets are publicly available."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 3.1 describes the three training datasets and their characteristics. Section 4.2 describes ICE corpus (collection methodology, sampling of 500 texts per variety). Appendix E documents AI risk behavior clustering from Perez et al. (2023)."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. All data sources are standard public benchmarks and corpora (MMLU, ICE, Anthropic risk evals)."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The pipeline is documented: IsoFLOP training → compute-optimal selection (Figure 2 left) → evaluation on benchmarks → scaling law fitting → relative scaling analysis. Appendix A details the training configuration pipeline."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Acknowledgements section discloses: Google TPU Research Cloud, Stanford HAI–GCP Grant (Marin Project), and Open Philanthropy funding for WH and DY."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are listed: Stanford University, OpenAthena, Georgia Institute of Technology. No authors are affiliated with the companies whose models are evaluated (OLMo/AI2, Meta/Llama, Alibaba/Qwen)."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Funding from Google TPU Research Cloud (compute), Stanford HAI, and Open Philanthropy. Open Philanthropy has an interest in AI safety, but the paper's findings on AI risk are mixed (some risks increase, some don't), suggesting no outcome pressure."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper trains its own models on CommonPile, DCLM Baseline, and Nemotron-CC but does not state the cutoff dates for these training datasets."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No discussion of whether MMLU questions, ICE text, or Anthropic risk evaluation questions appear in the training corpora (CommonPile, DCLM, Nemotron-CC)."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "MMLU was published in 2021 and ICE much earlier. All three training corpora include web data that could contain benchmark content. This contamination risk is not discussed."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference cost or evaluation time reported for the 255 models across the benchmarks."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "The compute budget is the core experimental variable: 255 models trained under IsoFLOP budgets from 10^18 to 10^20 FLOPs. Google TPU Research Cloud is acknowledged as the compute provider."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No mention of multiple random seeds or seed sensitivity. Each of the 255 models appears to be a single training run."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The paper does not state whether results are from single runs or averaged over multiple runs."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Section 3.1 mentions hyperparameters are set via heuristic reparameterizations from a tuned configuration (Wen et al., 2025), but the search budget for the original tuning is not reported."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "The compute-optimal model selection is explicit: for each FLOP budget, model size and token count are swept, and the compute-optimal point is selected (Figure 2, left panel). The selection criterion is clearly stated."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Many relative scaling comparisons are made across domains, regions, and risk categories, but no multiple comparison correction is applied."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors train all 255 models and evaluate them on their proposed framework. No discussion of author-evaluation bias or independent evaluation."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": true,
    324         "justification": "Performance as a function of compute budget is the entire paper — all figures plot performance metrics against FLOPs."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "Section 3.2 discusses MMLU evaluation protocol issues (prompt format effects on R², surface form competition). The Limitations section questions whether loss maps to downstream utility. Appendix C validates across multiple benchmarks."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": false,
    333         "answer": false,
    334         "justification": "No scaffolding is involved — models are evaluated directly on benchmarks."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "The paper does not discuss temporal leakage, i.e., whether MMLU, ICE, or Anthropic risk evaluation content predates, and could therefore appear in, the CommonPile, DCLM, or Nemotron-CC training corpora."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether the evaluation setup leaks information through context or whether the modified MCQ prompt format introduces feature leakage."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of independence between training data and evaluation benchmarks."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No concrete leakage detection or prevention methods are applied despite training models on web-scale corpora and evaluating on public benchmarks."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "Academic domains on MMLU converge toward parity as compute increases, regardless of training data distribution.",
    363       "evidence": "Figure 4 shows convergence across all three training corpora. For CommonPile, gaps narrow from −29%/−16%/−19% at 10^18 FLOPs to within 5% at 10^20 FLOPs. Similar patterns for DCLM and Nemotron.",
    364       "supported": "strong"
    365     },
    366     {
    367       "claim": "Regional English varieties show mixed relative scaling trajectories correlated with online English-speaking population size.",
    368       "evidence": "Figure 5 shows Canada converging, Singapore flat, Sri Lanka and Nigeria diverging. Pearson R = 0.82–0.84 (p < 0.005) between relative scaling slope and estimated online English-speaking population across all 10 ICE corpora and 3 training datasets.",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "AI risk behaviors split into two groups: capability/influence-related risks increase with scale while adversarial risks (Scheming, Incorrigibility) do not.",
    373       "evidence": "Figure 7 shows Self-Improvement, Influence, and Self-Replication scaling with compute while Scheming and Incorrigibility are flat or declining across all three training corpora.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "Model-size scaling drives relative performance shifts, while data scaling leaves relative performance unchanged.",
    378       "evidence": "Figure 6 shows that scaling model size at fixed tokens produces relative shifts similar to compute-optimal scaling, but scaling tokens at fixed model size yields nearly parallel lines.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "A modified MCQ prompt format achieves both high accuracy and predictable loss scaling for MMLU.",
    383       "evidence": "Figure 3 shows modified MCQ achieves R²=0.61 and max 81.3% accuracy, compared to CF (R²=0.68, max 57.7%) and standard MCQ (R²=0.28, max 82.0%).",
    384       "supported": "strong"
    385     }
    386   ],
    387   "methodology_tags": ["benchmark-eval", "theoretical"],
    388   "key_findings": "Relative scaling laws reveal that scaling compute has non-uniform effects across subpopulations. MMLU knowledge domains converge with scale regardless of training data, but regional English varieties diverge for smaller-population dialects (Pearson R=0.82–0.84 between population size and scaling slope). AI risk behaviors split: capability-related risks increase with scale while adversarial risks (scheming, incorrigibility) do not emerge during pretraining. Model size, not data volume, drives these relative shifts.",
    389   "red_flags": [
    390     {
    391       "flag": "No contamination analysis for benchmark evaluation",
    392       "detail": "255 models are trained on web-scale corpora (CommonPile, DCLM, Nemotron-CC) and evaluated on public benchmarks (MMLU published 2021, ICE much earlier). No analysis of whether benchmark content appears in training data. Since relative scaling laws measure performance gaps, differential contamination across subdomains could confound the relative scaling conclusions."
    393     },
    394     {
    395       "flag": "Single training runs without variance reporting",
    396       "detail": "Each of the 255 models appears to be a single training run with no seed sensitivity analysis. Given that the paper interprets relative scaling slopes only when their sign is significant at P < 0.05, the absence of reported run-to-run variance is a concern — the bootstrap test on slopes may not capture training stochasticity."
    397     },
    398     {
    399       "flag": "AI risk evaluation limitations",
    400       "detail": "The Anthropic AI risk evaluations (Perez et al., 2023) are model-written evaluations that measure binary yes/no tendencies. For the base models studied here, the absolute risk likelihoods range from ~49% to ~53% — very close to the 50% chance level. Small shifts in this regime may not reflect meaningful behavioral differences, yet the paper draws conclusions about risk trajectory."
    401     }
    402   ],
    403   "cited_papers": [
    404     {
    405       "title": "Scaling Laws for Neural Language Models",
    406       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"],
    407       "year": 2020,
    408       "arxiv_id": "2001.08361",
    409       "relevance": "Foundational scaling law paper establishing power-law relationships between compute, data, parameters, and model performance."
    410     },
    411     {
    412       "title": "Training Compute-Optimal Large Language Models",
    413       "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"],
    414       "year": 2022,
    415       "arxiv_id": "2203.15556",
    416       "relevance": "Chinchilla scaling laws establishing compute-optimal model size and data size tradeoffs."
    417     },
    418     {
    419       "title": "Inverse Scaling: When Bigger Isn't Better",
    420       "authors": ["Ian R. McKenzie"],
    421       "year": 2023,
    422       "relevance": "Documents cases where larger models perform worse, directly relevant to understanding non-uniform scaling effects."
    423     },
    424     {
    425       "title": "Are Emergent Abilities of Large Language Models a Mirage?",
    426       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    427       "year": 2023,
    428       "arxiv_id": "2304.15004",
    429       "relevance": "Challenges emergent abilities claims as metric artifacts, directly relevant to evaluation methodology for scaling studies."
    430     },
    431     {
    432       "title": "Discovering Language Model Behaviors with Model-Written Evaluations",
    433       "authors": ["Ethan Perez"],
    434       "year": 2023,
    435       "relevance": "Source of the Anthropic AI risk evaluations used in Case Study 3 for measuring risk behavior scaling."
    436     },
    437     {
    438       "title": "Alignment Faking in Large Language Models",
    439       "authors": ["Ryan Greenblatt"],
    440       "year": 2024,
    441       "arxiv_id": "2412.14093",
    442       "relevance": "Directly relevant to AI safety evaluation — studies deceptive alignment behaviors that relate to the scheming risk cluster examined in this paper."
    443     },
    444     {
    445       "title": "Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling",
    446       "authors": ["Stella Biderman"],
    447       "year": 2023,
    448       "relevance": "Open scaling suite for scaling law evaluation, analogous resource to what this paper provides."
    449     },
    450     {
    451       "title": "Language Models Scale Reliably with Over-Training and on Downstream Tasks",
    452       "authors": ["Samir Yitzhak Gadre"],
    453       "year": 2024,
    454       "arxiv_id": "2403.08540",
    455       "relevance": "Studies downstream scaling reliability, directly relevant to understanding when scaling laws predict task performance."
    456     },
    457     {
    458       "title": "Holistic evaluation of language models",
    459       "authors": ["Percy Liang"],
    460       "year": 2023,
    461       "relevance": "HELM benchmark emphasizing multi-domain evaluation and robustness across distributions."
    462     },
    463     {
    464       "title": "Paloma: A Benchmark for Evaluating Language Model Fit",
    465       "authors": ["Ian Magnusson"],
    466       "year": 2024,
    467       "arxiv_id": "2312.10523",
    468       "relevance": "Benchmark for evaluating model fit across heterogeneous domains, directly relevant to measuring distributional robustness."
    469     },
    470     {
    471       "title": "Chinchilla Scaling: A Replication Attempt",
    472       "authors": ["Tamay Besiroglu"],
    473       "year": 2024,
    474       "arxiv_id": "2404.10102",
    475       "relevance": "Replication study of Chinchilla scaling laws highlighting reproducibility challenges in scaling research."
    476     },
    477     {
    478       "title": "Scaling Laws Are Unreliable for Downstream Tasks: A Reality Check",
    479       "authors": ["Nicholas Lourie"],
    480       "year": 2025,
    481       "arxiv_id": "2507.00885",
    482       "relevance": "Challenges reliability of scaling laws for downstream predictions, directly relevant to the evaluation methodology concerns in this paper."
    483     }
    484   ]
    485 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs