scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19975B)
      1 {
      2   "paper": {
      3     "title": "Evidence of Phase Transitions in Small Transformer-Based Language Models",
      4     "authors": ["Noah Hong", "Tao Hong"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2511.12768",
      8     "doi": "10.48550/arXiv.2511.12768"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["observational", "case-study"],
     13   "key_findings": "A 3.6M-parameter GPT-style transformer trained on Tiny Shakespeare exhibits phase-transition-like reorganization around epochs 230–250, detected via Poisson-centered diagnostics (dispersion, KL divergence) alongside vocabulary composition and word length metrics. These transitions are invisible in standard loss curves but visible in continuous statistical probes. The study demonstrates that emergent reorganizations are observable in small models, detectable in linear training space without log-scaling, and occur early in training.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses the publicly available Tiny Shakespeare corpus (~1.1M character tokens), a standard public dataset."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, dependency files, or hardware details are provided. The model architecture is described but no software stack or library versions are mentioned."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The methods section describes the setup conceptually but not enough to reproduce without guessing implementation details."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section III.F states 'shaded error bands (where plotted) denoting ± one standard deviation' and figures show mean ± s.d. across 5 seeds."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims discontinuities and phase transitions but uses no formal statistical tests (no p-values, no change-point detection tests) to confirm that the observed cusps are statistically significant rather than visual impressions."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports concrete magnitudes: word length jumps from ~1.5 to ~2.5 characters, dispersion values shift from D≈1 to D<1, and specific epoch ranges (230–250) for transitions."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper uses 5 seeds and 30,000 tokens per checkpoint but provides no justification for why these numbers are sufficient. No power analysis or discussion of whether 5 seeds is adequate."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Results are averaged across 5 independent seeds with standard deviation bands shown on figures, as stated in Section III.F."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The Poisson distribution serves as the statistical baseline throughout. The paper compares empirical distributions against fitted Poisson baselines (Figs. 1–3, KL divergence analysis)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No comparison with other methods for detecting phase transitions or training dynamics (e.g., loss landscape analysis, representation similarity analysis, or other recently proposed probes)."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No ablation study is performed. The paper does not vary architecture size, dataset, tokenization, or other components to test which factors drive the observed transition."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics are used: index of dispersion, KL divergence, average word length, unique vocabulary counts, and word frequency snapshots."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is not relevant to the claims about statistical signatures of phase transitions in training dynamics."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "This is not a benchmark evaluation paper. The study analyzes generated text during training, not performance on a test set."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
      98         "justification": "Results are broken down by correct vs. incorrect words, and shown across multiple training checkpoints (epochs 0, 150, 250, 300, 350, 500, 599)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No discussion of cases where the transition was less clear, seeds that behaved differently, or conditions under which the diagnostics might fail."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that standard loss/validation curves do NOT reveal the transition, which is a negative result about conventional metrics. Section IV.A: 'not visible in the smooth training/validation losses.'"
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The three claims in the abstract (transitions in small models, detectable in linear space, occurring early in training) are all supported by the results in Sections IV and V."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper uses causal language ('driven by the abrupt emergence of longer, coherent words', 'fragment proliferation precedes word consolidation') but the study design is observational — it observes correlations between metrics, not causal mechanisms. The claims about 'barrier-crossing dynamics' and 'first-order phase transitions' are interpretive analogies, not causally demonstrated."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section V.F explicitly states limitations: single architecture, single dataset, character-level tokenization, and that 'generalization to larger models, multilingual corpora, or instruction-tuned datasets remains untested.'"
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section II.E discusses the Schaeffer et al. critique that emergent abilities may be metric artifacts, and Section V.E explains how their methodology addresses this concern using continuous metrics rather than binary thresholds."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper measures word-level statistics (dispersion, KL divergence, vocabulary counts) and frames these as evidence of 'phase transitions' and 'emergent abilities,' but does not discuss whether these vocabulary-level probes are adequate proxies for the broader claims about phase transitions in neural learning. The gap between character-level word formation and genuine emergent abilities is not acknowledged."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The model is custom-built and fully specified: embedding dimension 192, 8 transformer layers, 6 attention heads, ~3.6M parameters, context length 128 characters."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use prompting. The model is trained from scratch with next-character prediction."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Only decoding parameters are stated (temperature T=1.0, greedy top-1). Training hyperparameters (learning rate, optimizer, batch size, weight decay) are not reported."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. This is a standard model training study."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section III.B describes the text segmentation procedure (whitespace and punctuation boundaries) and correct/incorrect classification (corpus vocabulary membership). Section III.D describes the windowing procedure (W=21 words)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section V.F 'Limitations and Scope' provides a dedicated subsection with six specific limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section V.F lists specific threats: single architecture and dataset, character-level tokenization differs from BPE, correct/incorrect heuristic may misclassify rare valid words, external vs. internal metrics gap, alternative decoding methods might shift results."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section V.F explicitly states what was NOT tested: 'generalization to larger models, multilingual corpora, or instruction-tuned datasets remains untested' and 'we have not yet examined universality across model sizes, datasets, or architectural families.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No generated text samples, computed statistics, or intermediate data are released for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section III.A describes the data source (Tiny Shakespeare, ~1.1M character tokens, 65 unique characters) and sampling procedure (30,000 tokens per checkpoint, 5 seeds, 0–600 epochs)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data is a standard public corpus."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from text generation → segmentation → correct/incorrect labeling → windowing → dispersion/KL computation is described across Sections III.B–III.E."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding disclosure, acknowledgments section, or mention of financial support anywhere in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are listed: Noah Hong at Lynbrook High School, Tao Hong at Keysight Technologies."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding statement is not the same as confirming no conflicts."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial disclosure statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper trains its own model from scratch on a known corpus. It does not evaluate a pre-trained model's capability on any benchmark."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No pre-trained model is evaluated on a benchmark. The study analyzes training dynamics of a custom model."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No benchmark evaluation of a pre-trained model is performed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No training or inference costs, wall-clock time, or hardware specifications are reported despite training across 5 seeds for 600 epochs."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No GPU hours, hardware used, or total compute budget is stated."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "Phase-transition-like reorganizations occur in a small (3.6M parameter) transformer, not just in large language models.",
    296       "evidence": "Synchronized discontinuities in dispersion, KL divergence, vocabulary dynamics, and word length observed across 5 seeds at epochs 230–250 (Sections IV.A–IV.H, Figs. 1–15).",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Phase transitions can be detected directly in linear training space without log-scaling.",
    301       "evidence": "All metrics are plotted against raw epoch number with no logarithmic transformation, and discontinuities are visible (Section V.A).",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Phase transitions occur early in training, around epochs 230–250, well before loss convergence.",
    306       "evidence": "Multiple metrics show synchronized cusps at epochs 230–250 in a 600-epoch training run (Section IV).",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "Standard loss and validation curves do not reveal these transitions.",
    311       "evidence": "Section IV.A states transitions 'are not visible in the smooth training/validation losses' but no loss curves are shown for comparison.",
    312       "supported": "weak"
    313     },
    314     {
    315       "claim": "The transition represents a genuine phase-transition-like reorganization, not a metric artifact.",
    316       "evidence": "Multiple independent continuous metrics (dispersion, KL divergence, word length, vocabulary counts) show synchronized cusps (Section V.E). However, no formal statistical test for change-point detection is applied.",
    317       "supported": "moderate"
    318     }
    319   ],
    320   "red_flags": [
    321     {
    322       "flag": "Missing training hyperparameters",
    323       "detail": "Learning rate, optimizer, batch size, and weight decay are not reported. These are critical for reproduction and could influence the location and existence of the observed transition."
    324     },
    325     {
    326       "flag": "No formal change-point or significance tests",
    327       "detail": "The paper claims 'phase transitions' and 'discontinuities' but relies entirely on visual inspection of curves. No formal statistical test (e.g., change-point detection, permutation test) is used to confirm the cusps are statistically significant."
    328     },
    329     {
    330       "flag": "Single architecture, single dataset",
    331       "detail": "All results are from one 3.6M parameter model on Tiny Shakespeare. The claims about phase transitions being 'a general feature of language model training' are not supported by this single configuration."
    332     },
    333     {
    334       "flag": "Loss curves not shown",
    335       "detail": "The paper claims transitions are invisible in loss curves but does not actually show the loss curves, making this claim unverifiable."
    336     },
    337     {
    338       "flag": "Potential familial conflict of interest",
     339       "detail": "Both authors share the surname 'Hong'; one is affiliated with a high school (Lynbrook High School) and the other with an industry employer (Keysight Technologies). Reference [7] is by T. Hong (a co-author) and forms a key methodological basis. Neither the self-citation nor the potential familial relationship is disclosed in the paper."
    340     }
    341   ],
    342   "cited_papers": [
    343     {
    344       "title": "Emergent Abilities of Large Language Models",
    345       "authors": ["J. Wei"],
    346       "year": 2022,
    347       "relevance": "Foundational work defining emergent abilities as capabilities that cannot be linearly extrapolated from smaller models, directly motivating this study."
    348     },
    349     {
    350       "title": "Are Emergent Abilities of Large Language Models a Mirage?",
    351       "authors": ["R. Schaeffer", "B. Miranda", "S. Koyejo"],
    352       "year": 2023,
    353       "arxiv_id": "2304.15004",
    354       "relevance": "Challenges the reality of emergent abilities, arguing they may be metric artifacts; this paper's methodology is designed to address this critique."
    355     },
    356     {
    357       "title": "Grokking: Generalization Beyond Overfitting on Small Algorithmic Datasets",
    358       "authors": ["A. Power", "Y. Burda", "H. Edwards", "I. Babuschkin", "V. Misra"],
    359       "year": 2022,
    360       "relevance": "Closest analogue showing small models undergoing abrupt transitions from memorization to generalization on algorithmic tasks."
    361     },
    362     {
    363       "title": "Language Models are Few-Shot Learners",
    364       "authors": ["T. Brown"],
    365       "year": 2020,
    366       "relevance": "GPT-3 paper establishing scaling as a driver of emergent generalization in language models."
    367     },
    368     {
    369       "title": "Statistical Mechanics of Deep Learning",
    370       "authors": ["Y. Bahri", "J. Kadmon", "J. Pennington", "S. S. Schoenholz", "J. Sohl-Dickstein", "S. Ganguli"],
     371       "year": 2020,
    372       "relevance": "Comprehensive review applying statistical-mechanical tools to deep networks, providing theoretical justification for phase-transition interpretations."
    373     },
    374     {
    375       "title": "Grokking as a First Order Phase Transition in Two Layer Networks",
    376       "authors": ["N. Rubin", "I. Seroussi", "Z. Ringel"],
    377       "year": 2024,
    378       "arxiv_id": "2310.03789",
    379       "relevance": "First formal statistical-physics model of grokking as a first-order phase transition, providing theoretical framework adopted in this paper."
    380     }
    381   ]
    382 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs