scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24076B)
      1 {
      2   "paper": {
      3     "title": "Neural Neural Scaling Laws",
      4     "authors": ["Michael Y. Hu", "Jane Pan", "Ayush Rajesh Jhaveri", "Nicholas Lourie", "Kyunghyun Cho"],
      5     "year": 2026,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2601.19831",
      8     "doi": "10.48550/arXiv.2601.19831"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "NEUNEU, a neural network trained on open-source model checkpoints, achieves 2.04% MAE in predicting downstream task accuracy across 66 tasks — a 38% reduction relative to logistic scaling laws (3.29% MAE). The model generalizes zero-shot to unseen model families, parameter counts, and downstream tasks. Token-level validation loss distributions carry predictive signal that averaging into a single loss value obscures. NEUNEU also achieves 75.6% ranking accuracy for predicting which model configuration will perform better, a 12.3 percentage-point improvement over logistic baselines (63.3%).",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper provides a GitHub link: https://github.com/michahu/neuneu (stated at end of §1)."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper states it is trained entirely on open-source model checkpoints from HuggingFace (DataDecide model suite, Magnusson et al., 2025), which are publicly available. Validation data uses WebOrganizer dataset (Wettig et al., 2025)."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is mentioned in the paper."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper itself. The GitHub repo is linked but the paper does not describe reproduction steps."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The paper reports ±2σ over 5 random seeds for neural models (§3) and 95% bootstrap confidence intervals for ranking accuracy (Figure 7)."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims NEUNEU outperforms baselines but does not report statistical significance tests (p-values, t-tests, etc.). Comparisons are based on MAE differences without formal tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports improvements together with their baseline context: a '38% reduction compared to logistic scaling laws (3.29% MAE)', reaching 2.04% MAE, and a '12.3% improvement' in ranking accuracy (0.756 vs 0.633, i.e. 12.3 percentage points)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification is given for why 6 model sizes from DataDecide were used, or why 66 downstream tasks were chosen. No power analysis is discussed."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The paper reports ±2σ over 5 random seeds for neural models (§3). The logistic baseline is deterministic apart from numerical imprecision, so it has no seed variance to report."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Multiple baselines are included: LOGISTIC (logistic scaling laws), LC-PFN (meta-learning method from Adriaensen et al., 2023), DIFFPROBE, NOLOSS, AVERAGE, and HISTDIFF ablations (§2.4)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "LC-PFN (2023) and logistic scaling laws from Magnusson et al. (2025) and Gadre et al. (2025) are recent and represent the standard approach."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple ablations are provided: NOLOSS (no validation loss), AVERAGE (average loss only), HISTDIFF (histogram deltas), and DIFFPROBE (MLP on histograms). These isolate the contribution of token-level losses, temporal context, and architecture (§3, Table 3b)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper reports MAE for prediction accuracy, ranking accuracy for model selection (Figure 7), calibration of uncertainty estimates (Figure 6C), and per-task breakdowns (Table 3b)."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is not relevant for a scaling law prediction model — the outputs are numerical accuracy predictions that can be compared directly against ground truth."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Clear held-out evaluation: new random seeds, held-out pretraining datasets (C4), unseen model family (Pythia), and 13 withheld downstream tasks (§2.4)."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 3b provides per-task MAE breakdowns for 10 OLMES tasks. Figure 3a shows performance across different generalization conditions (seeds, data, Pythia, unseen tasks)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "The paper does not discuss specific failure cases or tasks where NEUNEU performs poorly. Figures 10-12 show all task visualizations but there is no error analysis."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Figure 8 shows that using token losses directly (instead of probabilities) produces worse results — a negative finding that informed the design. The AVERAGE ablation shows that average loss is worse than token-level, which is a constructive negative result."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims (2.04% MAE, 38% reduction, zero-shot generalization) are all supported by results in §3, Figure 3a, and Figure 5."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper makes causal claims through ablations ('token-level losses carry predictive signal that averaging obscures'). The ablation design (NEUNEU vs AVERAGE vs NOLOSS) is controlled single-variable manipulation, which is adequate for these claims."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "§5.2 Limitations explicitly bounds generalization: same validation set requirement, classification tasks only (not generative), specific model families tested. The title 'Neural Neural Scaling Laws' is descriptive of the method rather than overclaiming."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not substantively discuss alternative explanations for why NEUNEU outperforms baselines beyond the hypothesis that distributional information helps. No discussion of confounds like NEUNEU having more parameters or being trained on more data than LC-PFN."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper's claims match its measurements: MAE on accuracy predictions, ranking accuracy. It does not overframe these as broader claims about 'understanding scaling' — it specifically discusses prediction accuracy."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Specific model sizes are listed: {90M, 150M, 300M, 530M, 750M, 1B} from DataDecide, and {70M, 1.4B, 2.8B, 6.9B} from Pythia. NEUNEU itself is ~20M parameters (§2.4)."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use prompting — NEUNEU is a neural network trained from scratch, and the evaluated language models are evaluated on downstream tasks using standard evaluation suites."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Table 1 provides comprehensive hyperparameters: hidden dimension 512, 6 transformer layers, 8 attention heads, batch size 256, learning rate 6×10⁻⁴, weight decay 0.033, 3 epochs, warmup ratio 0.1, CNN architecture details, etc."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. NEUNEU is a standard neural network with a forward pass."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "§2.3 details the data pipeline: how training samples are constructed from checkpoint accuracies, random dropping with p=0.4, gap absorption, token probability computation from 256,000 contiguous tokens, whitespace tokenization with subword combination."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "§5.2 'Limitations and Future Work' discusses several specific limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "§5.2 discusses specific threats: same validation set requirement, classification-only tasks (generative tasks may differ), and the need to interpret what the CNN learns. These are specific to this study."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "§5.2 explicitly states scope boundaries: 'the downstream tasks in our evaluation suite are classification tasks where accuracy is the natural metric, and generative tasks may exhibit different scaling dynamics' and 'NEUNEU is designed to use the same validation set.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "All training data is from open-source HuggingFace checkpoints (DataDecide model suite). The paper emphasizes 'anyone can train our model from open-source data' (§2.3)."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "§2.3 describes exactly how training data is constructed: which model checkpoints, which seeds, how samples are created from trajectories, what validation data is used (256,000 tokens from WebOrganizer)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data source is standard public model checkpoints."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "§2.3 documents the full pipeline: checkpoint accuracies → gap imputation → random dropping → subsequence construction → target generation. Equations 14-16 formalize each step."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgments section lists NSF Graduate Research Fellowship, NYU HPC, IITP/MSIT grant, Samsung Advanced Institute of Technology, and NSF Award 1922658."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors are affiliated with New York University (stated in header). The paper evaluates open-source models, not products from the authors' institution."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Funders (NSF, IITP/MSIT, Samsung, NYU) have no direct stake in whether NEUNEU outperforms logistic scaling laws."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The language models being evaluated (DataDecide, Pythia) have known training data, but the paper does not state training cutoff dates for these models relative to the OLMES benchmark tasks."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether the downstream evaluation tasks (OLMES suite) appeared in the pretraining data of the DataDecide or Pythia models."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The OLMES tasks include well-known benchmarks (ARC, MMLU, HellaSwag) that could have appeared in pretraining data. This is not discussed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "§2.4 states 'at inference time, NEUNEU is not significantly more expensive to run than logistic scaling laws, taking only a few seconds of wall clock time on CPU for one downstream task.'"
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total training compute budget is stated for training NEUNEU itself. NYU HPC is acknowledged but GPU hours or training time are not reported."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "§3 states 'we also report ±2σ over 5 random seeds as an error bar' for neural models."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "§3 explicitly states 5 random seeds for neural models."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No discussion of how hyperparameters in Table 1 were selected or how many configurations were tried."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper does not explain how the final hyperparameter configuration was selected from potential alternatives."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The paper compares 7 methods across 66 tasks without any multiple comparison correction. No formal statistical tests are applied at all."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement all methods including baselines (LOGISTIC, LC-PFN) without acknowledging potential bias from implementing competitors' methods."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "NEUNEU has ~20M parameters and is trained for 3 epochs while logistic scaling laws are simple curve fits. The compute difference is not discussed or controlled for."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper does not discuss whether MAE on accuracy prediction is the right metric for evaluating scaling law quality, or whether the OLMES task suite is representative."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved — all methods are evaluated on the same prediction task with the same inputs."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether the OLMES benchmark tasks (ARC, MMLU, etc.) were in the pretraining data of the models whose trajectories NEUNEU learns from."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether NEUNEU's training on certain model families leaks information about the evaluation setup."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The training data (DataDecide models) and test data (Pythia, held-out seeds/datasets) share the same downstream evaluation tasks. This structural similarity is not discussed."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection or prevention method is applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "NEUNEU achieves 2.04% MAE on 66 downstream tasks, a 38% reduction compared to logistic scaling laws (3.29% MAE).",
    365       "evidence": "Figure 3a and Table 3b show MAE comparisons across all evaluation conditions and per-task breakdowns.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "NEUNEU generalizes zero-shot to unseen model families, parameter counts, and downstream tasks.",
    370       "evidence": "Figure 3a shows generalization to Pythia; Figure 5 shows that NEUNEU's MAE on 13 unseen tasks is lower than the MAE logistic scaling laws achieve on seen tasks.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Token-level validation losses carry predictive signal that averaging obscures.",
    375       "evidence": "Ablation comparing NEUNEU vs AVERAGE vs NOLOSS in Figure 3a and Table 3b, plus HISTDIFF showing distributional features help.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "NEUNEU's uncertainty estimates are well-calibrated, with 74.9% of ground truth landing in the 10%-90% interquantile range (expected 80%).",
    380       "evidence": "Figure 6C shows calibration analysis.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "NEUNEU achieves 75.6% ranking accuracy for predicting which model configuration achieves better final performance, a 12.3 percentage-point improvement over logistic baselines (63.3%).",
    385       "evidence": "Figure 7 with 95% bootstrap confidence intervals.",
    386       "supported": "strong"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "No contamination analysis for underlying benchmarks",
    392       "detail": "The OLMES tasks (ARC, MMLU, HellaSwag) are well-known benchmarks that may be contaminated in the pretraining data of the models being predicted. If the models' benchmark scores are inflated by contamination, NEUNEU is learning to predict contaminated scores."
    393     },
    394     {
    395       "flag": "Advantageous baseline setup for LOGISTIC",
    396       "detail": "The paper gives LOGISTIC the ground-truth future validation loss (ℓ_{t+K}) to be maximally fair to the baseline, but this also means LOGISTIC is solving a fundamentally different problem — an easier one in the sense that it does not have to forecast the future loss itself. The comparison is therefore somewhat apples-to-oranges."
    397     },
    398     {
    399       "flag": "No hyperparameter search budget reported",
    400       "detail": "The 20M parameter NEUNEU model has many design choices (CNN architecture, transformer size, drop probability, etc.) but no discussion of how these were selected or how many configurations were tried."
    401     }
    402   ],
    403   "cited_papers": [
    404     {
    405       "title": "Scaling laws for neural language models",
    406       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"],
    407       "year": 2020,
    408       "arxiv_id": "2001.08361",
    409       "relevance": "Foundational scaling laws paper that NEUNEU builds upon and aims to improve."
    410     },
    411     {
    412       "title": "An empirical analysis of compute-optimal large language model training",
    413       "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"],
    414       "year": 2022,
    415       "relevance": "Chinchilla scaling laws paper that established the dominant power-law functional form NEUNEU replaces."
    416     },
    417     {
    418       "title": "Inverse scaling: When bigger isn't better",
    419       "authors": ["Ian R. McKenzie"],
    420       "year": 2023,
    421       "relevance": "Documents inverse scaling phenomenon that motivates the need for more expressive scaling law predictors."
    422     },
    423     {
    424       "title": "Emergent abilities of large language models",
    425       "authors": ["Jason Wei"],
    426       "year": 2022,
    427       "relevance": "Documents emergent capabilities in LLMs that are hard to predict with simple scaling laws."
    428     },
    429     {
    430       "title": "Are emergent abilities of large language models a mirage?",
    431       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    432       "year": 2023,
    433       "relevance": "Challenges the emergence narrative and shows metric choice affects apparent emergence — directly relevant to scaling law prediction."
    434     },
    435     {
    436       "title": "Scaling laws are unreliable for downstream tasks: A reality check",
    437       "authors": ["Nicholas Lourie", "Michael Y. Hu", "Kyunghyun Cho"],
    438       "year": 2025,
    439       "relevance": "Companion paper by same group showing parametric scaling laws are unreliable for downstream tasks."
    440     },
    441     {
    442       "title": "DataDecide: How to predict best pretraining data with small experiments",
    443       "authors": ["Ian Magnusson"],
    444       "year": 2025,
    445       "relevance": "Provides the training data (model checkpoints) used to train NEUNEU and the logistic scaling law baseline."
    446     },
    447     {
    448       "title": "Pythia: a suite for analyzing large language models across training and scaling",
    449       "authors": ["Stella Biderman"],
    450       "year": 2023,
    451       "relevance": "Provides the held-out model family used to test NEUNEU's zero-shot generalization."
    452     },
    453     {
    454       "title": "Broken neural scaling laws",
    455       "authors": ["Ethan Caballero", "Kshitij Gupta", "Irina Rish", "David Krueger"],
    456       "year": 2023,
    457       "relevance": "Proposes expressive parametric forms for scaling laws — a direct baseline approach that NEUNEU aims to improve upon."
    458     },
    459     {
    460       "title": "Efficient bayesian learning curve extrapolation using prior-data fitted networks",
    461       "authors": ["Steven Adriaensen"],
    462       "year": 2023,
    463       "relevance": "LC-PFN method used as a baseline — transformer-based meta-learning for learning curve prediction."
    464     }
    465   ]
    466 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs