scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26324B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Evidence of Phase Transitions in Small Transformer-Based Language Models",
      6     "authors": [
      7       "Noah Hong",
      8       "Tao Hong"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv.org",
     12     "arxiv_id": "2511.12768",
     13     "doi": "10.48550/arXiv.2511.12768"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The abstract's three central claims — transitions in small models, detectability in linear space, and early emergence — are each supported by the empirical results presented in Sections IV and V.",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "The paper uses causal and mechanistic language ('barrier-crossing dynamics,' 'the system must overcome nucleation barriers') but the study design is purely observational — 5 seeds of one architecture on one corpus — insufficient for causal inference.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The abstract and conclusion claim phase transitions are 'a general feature of language model training, observable at any scale,' but evidence is from a single 3.6M-parameter model on Tiny Shakespeare; the limitations section acknowledges but does not constrain the sweeping conclusion.",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "The paper explicitly engages with Schaeffer et al.'s metric-artifact explanation and argues its continuous metrics avoid that critique; this constitutes genuine engagement with an alternative interpretation.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper measures vocabulary statistics (dispersion, KL divergence, word length) but frequently conflates these proxies with 'emergent linguistic abilities' and 'phase transitions' without clearly distinguishing the statistical signal from the underlying construct.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section V.F 'Limitations and Scope' is a dedicated subsection listing six specific limitations.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Specific threats listed include: single architecture and dataset, character-level vs. subword tokenization differences, external vs. internal metric focus, decoding method sensitivity, and absence of universality testing — these are specific rather than boilerplate.",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The limitations section explicitly states that 'generalization to larger models, multilingual corpora, or instruction-tuned datasets remains untested,' bounding the scope of the findings.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No funding disclosure appears anywhere in the paper; whether the work is unfunded is not stated.",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Author affiliations are clearly stated: Noah Hong at Lynbrook High School and Tao Hong at Keysight Technologies.",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "No funder identified; question is not applicable.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests or financial interests statement appears in the paper.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Key terms are operationally defined: 'correct' vs. 'incorrect' words (corpus vocabulary membership), Poisson/sub-Poisson regimes (index of dispersion), and 'phase transition' is grounded in the statistical physics literature reviewed in Section II.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Section II.H explicitly enumerates three contributions: (1) phase transitions in small models, (2) detection in linear training space, (3) transitions occur early — clearly and specifically stated.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The related work section spans seven subsections covering statistical physics, grokking, emergent abilities, and critiques, explicitly positioning contributions relative to Wei et al., Schaeffer et al., Power et al., and Rubin et al.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "empirical": {
    117       "artifacts": {
    118         "code_released": {
    119           "applies": true,
    120           "answer": false,
    121           "justification": "No code repository or release is mentioned anywhere in the paper.",
    122           "source": "haiku"
    123         },
    124         "data_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "Tiny Shakespeare is a standard public corpus used unmodified; no novel dataset was created.",
    128           "source": "haiku"
    129         },
    130         "environment_specified": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "No requirements file, Dockerfile, or software environment specifications are provided; framework, Python version, and library versions are absent.",
    134           "source": "haiku"
    135         },
    136         "reproduction_instructions": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "Architecture specs are given but optimizer, learning rate, batch size, and weight decay are not reported, making reproduction impossible without guessing.",
    140           "source": "haiku"
    141         }
    142       },
    143       "statistical_methodology": {
    144         "confidence_intervals_or_error_bars": {
    145           "applies": true,
    146           "answer": true,
    147           "justification": "Section III.F states metrics are averaged across 5 seeds with ±1 standard deviation shaded error bands shown in figures.",
    148           "source": "haiku"
    149         },
    150         "significance_tests": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "No statistical significance tests are used; synchronization of cusps across metrics is argued visually without formal hypothesis testing.",
    154           "source": "haiku"
    155         },
    156         "effect_sizes_reported": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "Effect magnitudes are reported: average word length increases from ~1.5 to ~2.5 characters; specific epoch range (230–250) for transition is identified.",
    160           "source": "haiku"
    161         },
    162         "sample_size_justified": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "The choices of 5 seeds, 30,000 generated tokens per checkpoint, and window size W=21 are not justified by any power analysis or sensitivity analysis.",
    166           "source": "haiku"
    167         },
    168         "variance_reported": {
    169           "applies": true,
    170           "answer": true,
    171           "justification": "±1 standard deviation across 5 seeds is reported for all main metric figures.",
    172           "source": "haiku"
    173         }
    174       },
    175       "evaluation_design": {
    176         "baselines_included": {
    177           "applies": true,
    178           "answer": false,
    179           "justification": "The Poisson distribution serves as a mathematical baseline for dispersion, but no comparison models, alternative architectures, or alternative training procedures are tested.",
    180           "source": "haiku"
    181         },
    182         "baselines_contemporary": {
    183           "applies": false,
    184           "answer": false,
    185           "justification": "No model baselines to evaluate for contemporariness.",
    186           "source": "haiku"
    187         },
    188         "ablation_study": {
    189           "applies": true,
    190           "answer": false,
    191           "justification": "No ablation studies are performed; single architecture and window size are tested without varying components.",
    192           "source": "haiku"
    193         },
    194         "multiple_metrics": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "Multiple independent metrics are used: index of dispersion, KL divergence, average word length, unique correct/incorrect vocabulary counts, and word frequency snapshots.",
    198           "source": "haiku"
    199         },
    200         "human_evaluation": {
    201           "applies": false,
    202           "answer": false,
    203           "justification": "Human evaluation is not relevant to this study of training dynamics statistics.",
    204           "source": "haiku"
    205         },
    206         "held_out_test_set": {
    207           "applies": false,
    208           "answer": false,
    209           "justification": "This is a study of training dynamics rather than a prediction task requiring held-out evaluation.",
    210           "source": "haiku"
    211         },
    212         "per_category_breakdown": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Results are systematically broken down by correct vs. incorrect word categories throughout, with separate figures and analysis for each category.",
    216           "source": "haiku"
    217         },
    218         "failure_cases_discussed": {
    219           "applies": true,
    220           "answer": false,
    221           "justification": "The paper does not discuss cases where the diagnostic fails, seeds that deviate from the pattern, or conditions under which no transition is detected.",
    222           "source": "haiku"
    223         },
    224         "negative_results_reported": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "The paper explicitly reports that transitions are 'not apparent in standard loss or validation curves,' establishing a key negative result about standard monitoring metrics.",
    228           "source": "haiku"
    229         }
    230       },
    231       "setup_transparency": {
    232         "model_versions_specified": {
    233           "applies": true,
    234           "answer": true,
    235           "justification": "The model is custom-trained and fully described: 192 embedding dim, 8 transformer layers, 6 attention heads, 128 context length, ~3.6M parameters.",
    236           "source": "haiku"
    237         },
    238         "prompts_provided": {
    239           "applies": false,
    240           "answer": false,
    241           "justification": "This is a training dynamics study; prompts are not applicable.",
    242           "source": "haiku"
    243         },
    244         "hyperparameters_reported": {
    245           "applies": true,
    246           "answer": false,
    247           "justification": "Temperature (T=1.0) and window size (W=21) are reported, but learning rate, optimizer, batch size, and weight decay are absent.",
    248           "source": "haiku"
    249         },
    250         "scaffolding_described": {
    251           "applies": false,
    252           "answer": false,
    253           "justification": "No agentic scaffolding; the paper trains a language model from scratch.",
    254           "source": "haiku"
    255         },
    256         "data_preprocessing_documented": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "Segmentation procedure (whitespace and punctuation boundaries) and correctness labeling (corpus vocabulary membership) are explicitly described in Section III.B.",
    260           "source": "haiku"
    261         }
    262       },
    263       "data_integrity": {
    264         "raw_data_available": {
    265           "applies": true,
    266           "answer": false,
    267           "justification": "No checkpoint files, generated text samples, or metric time-series data are made available.",
    268           "source": "haiku"
    269         },
    270         "data_collection_described": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "Generation procedure is described: 30,000 tokens sampled at each checkpoint using T=1.0 decoding from the trained model across 5 seeds.",
    274           "source": "haiku"
    275         },
    276         "recruitment_methods_described": {
    277           "applies": false,
    278           "answer": false,
    279           "justification": "No human participants; question is not applicable.",
    280           "source": "haiku"
    281         },
    282         "data_pipeline_documented": {
    283           "applies": true,
    284           "answer": true,
    285           "justification": "The pipeline from corpus loading → model training → checkpoint generation → text segmentation → metric computation is described across Sections III.A through III.F.",
    286           "source": "haiku"
    287         }
    288       },
    289       "contamination": {
    290         "training_cutoff_stated": {
    291           "applies": false,
    292           "answer": false,
    293           "justification": "The paper trains its own model from scratch; there is no pre-trained model being evaluated on external benchmarks, so training cutoff is not applicable.",
    294           "source": "haiku"
    295         },
    296         "train_test_overlap_discussed": {
    297           "applies": false,
    298           "answer": false,
    299           "justification": "Not applicable; correctness is evaluated against the training corpus vocabulary, with no separate benchmark for contamination to affect.",
    300           "source": "haiku"
    301         },
    302         "benchmark_contamination_addressed": {
    303           "applies": false,
    304           "answer": false,
    305           "justification": "No benchmark evaluation; the model is assessed through statistical properties of its own generated text.",
    306           "source": "haiku"
    307         }
    308       },
    309       "human_studies": {
    310         "pre_registered": {
    311           "applies": false,
    312           "answer": false,
    313           "justification": "No human participants.",
    314           "source": "haiku"
    315         },
    316         "irb_or_ethics_approval": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human participants.",
    320           "source": "haiku"
    321         },
    322         "demographics_reported": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants.",
    326           "source": "haiku"
    327         },
    328         "inclusion_exclusion_criteria": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants.",
    332           "source": "haiku"
    333         },
    334         "randomization_described": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants.",
    338           "source": "haiku"
    339         },
    340         "blinding_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants.",
    344           "source": "haiku"
    345         },
    346         "attrition_reported": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants.",
    350           "source": "haiku"
    351         }
    352       },
    353       "cost_and_practicality": {
    354         "inference_cost_reported": {
    355           "applies": false,
    356           "answer": false,
    357           "justification": "This is a research study of training dynamics, not a deployed system; inference cost is not a relevant practical consideration.",
    358           "source": "haiku"
    359         },
    360         "compute_budget_stated": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "No compute budget (GPU hours, hardware used, wall-clock time) is reported for training 5 seeds × 600 epochs.",
    364           "source": "haiku"
    365         }
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "Phase-transition-like reorganizations occur in small (3.6M parameter) transformers, not only in large-scale LLMs.",
    372       "evidence": "Synchronized cusps in dispersion, KL divergence, word length, and vocabulary dynamics at epochs 230–250 across 5 seeds on Tiny Shakespeare.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "Phase transitions can be detected directly in linear training space without logarithmic rescaling of compute.",
    377       "evidence": "Dispersion and KL divergence show cusps along the raw epoch axis without log transformation; the transition is not visible in standard loss/validation curves.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "Transitions emerge early in training (epochs 230–250) before loss convergence.",
    382       "evidence": "Standard loss/validation curves remain smooth while Poisson-based metrics show synchronized discontinuities in the same narrow epoch window.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Temporary degradation (incorrect vocabulary peak, dispersion reversion) before improvement is evidence of first-order phase transition barrier-crossing dynamics.",
    387       "evidence": "Incorrect vocabulary peaks at step 250 and correct-word dispersion temporarily reverts to D≈1 before sub-Poisson stabilization.",
    388       "supported": "weak"
    389     },
    390     {
    391       "claim": "The 'dispersion flip' — correct words from near-Poisson to sub-Poisson, incorrect words from sub-Poisson to Poisson — constitutes a measurable order parameter change.",
    392       "evidence": "Figures 14–15 show this opposing trajectory across 5 seeds with ±1 SD error bands.",
    393       "supported": "strong"
    394     }
    395   ],
    396   "methodology_tags": [
    397     "observational",
    398     "case-study"
    399   ],
    400   "key_findings": "A 3.6M-parameter character-level transformer trained on Tiny Shakespeare exhibits a coordinated, phase-transition-like reorganization at approximately epochs 230–250, invisible in standard loss curves but detectable through Poisson-based statistical probes. Correct words shift from near-Poisson to sub-Poisson dispersion (structured usage) while incorrect words shift from sub-Poisson to Poisson (sparse random errors), with average word length jumping from ~1.5 to ~2.5 characters. Multiple independent metrics — index of dispersion, KL divergence from Poisson, vocabulary dynamics, and prefix formation tracking — show synchronized cusps in the same narrow epoch window, which the authors argue constitutes converging evidence for genuine internal reorganization rather than a metric artifact. The paper concludes that phase-transition-like phenomena are observable at modest scale without logarithmic rescaling, with a temporary increase in errors preceding consolidation interpreted as barrier-crossing dynamics analogous to first-order phase transitions.",
    401   "red_flags": [
    402     {
    403       "flag": "Single model, single corpus",
    404       "detail": "All findings derive from one 3.6M-parameter architecture trained on Tiny Shakespeare (~1.1M characters); sweeping claims that transitions are 'a general feature of language model training' and 'observable at any scale' are not supported by this narrow experimental scope."
    405     },
    406     {
    407       "flag": "No statistical significance testing",
    408       "detail": "Synchronization of cusps across metrics is argued visually with error bands but without any formal tests for coincidence of transition epochs or whether cusps are statistically distinguishable from noise."
    409     },
    410     {
    411       "flag": "Missing critical hyperparameters",
    412       "detail": "Optimizer, learning rate, batch size, and weight decay are not reported, making independent reproduction impossible despite the simple experimental setup."
    413     },
    414     {
    415       "flag": "No code or raw data released",
    416       "detail": "No repository, checkpoint files, or generated text data is provided; results cannot be independently verified."
    417     },
    418     {
    419       "flag": "Causal language exceeds observational design",
    420       "detail": "Terms like 'barrier-crossing dynamics,' 'the system must overcome nucleation barriers,' and 'generalization minimum overtakes memorization minimum' are theoretical analogies from physics presented as explanations for a purely observational study with no intervention."
    421     },
    422     {
    423       "flag": "Phase transition label may be unfalsifiable as used",
    424       "detail": "The paper defines the transition by existence of cusps in its chosen metrics without estimating critical exponents, testing for hysteresis, or applying finite-size scaling — the markers of genuine statistical mechanics phase transitions that would distinguish it from any smooth threshold effect."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "Emergent Abilities of Large Language Models",
    430       "relevance": "Foundational claim that capabilities emerge abruptly at scale; this paper asks whether identical dynamics occur in small models."
    431     },
    432     {
    433       "title": "Are Emergent Abilities of Large Language Models a Mirage?",
    434       "relevance": "Key critique that emergent abilities may be metric artifacts; this paper's methodology directly responds by using continuous statistical metrics."
    435     },
    436     {
    437       "title": "Grokking: Generalization Beyond Overfitting on Small Algorithmic Datasets",
    438       "relevance": "Closest small-scale analogue — abrupt reorganization in small models on algorithmic tasks; the paper extends this to linguistic tasks."
    439     },
    440     {
    441       "title": "Grokking as a First Order Phase Transition in Two Layer Networks",
    442       "relevance": "Provides formal statistical mechanics framing (effective potential with competing minima) used to interpret the observed dispersion flip."
    443     },
    444     {
    445       "title": "Statistical Mechanics of Deep Learning",
    446       "relevance": "Comprehensive review linking neural network training dynamics to phase transitions; supplies theoretical justification for the paper's framework."
    447     },
    448     {
    449       "title": "Progress Measures for Grokking via Mechanistic Interpretability",
    450       "relevance": "Mechanistic interpretability approach to tracking grokking that complements the external statistical metrics used here."
    451     }
    452   ],
    453   "engagement_factors": {
    454     "practical_relevance": {
    455       "score": 1,
    456       "justification": "The suggestion that temporary training degradation may signal impending reorganization could inform training protocols, but requires substantial further validation before practitioners could act on it."
    457     },
    458     "surprise_contrarian": {
    459       "score": 2,
    460       "justification": "Directly challenges the assumption that emergent abilities and phase transitions require billion-parameter models, arguing they are observable at 3.6M parameters with appropriate metrics."
    461     },
    462     "fear_safety": {
    463       "score": 0,
    464       "justification": "No AI safety or risk implications; purely mechanistic study of character-level training dynamics on a literary corpus."
    465     },
    466     "drama_conflict": {
    467       "score": 1,
    468       "justification": "Engages with the active Schaeffer et al. debate on whether emergent abilities are real or metric artifacts, but the engagement is scholarly rather than confrontational."
    469     },
    470     "demo_ability": {
    471       "score": 1,
    472       "justification": "Conceptually reproducible on Tiny Shakespeare with modest compute, but absence of code means replication requires significant effort."
    473     },
    474     "brand_recognition": {
    475       "score": 0,
    476       "justification": "Authors affiliated with a high school and Keysight Technologies (test equipment company), not a recognized AI research institution."
    477     }
    478   },
    479   "hn_data": {
    480     "threads": [
    481       {
    482         "hn_id": "33793174",
    483         "title": "Program Repair",
    484         "points": 25,
    485         "comments": 6,
    486         "url": "https://news.ycombinator.com/item?id=33793174",
    487         "created_at": "2022-11-29T20:56:48Z"
    488       },
    489       {
    490         "hn_id": "38422264",
    491         "title": "Prompting Frameworks for Large Language Models: A Survey",
    492         "points": 25,
    493         "comments": 4,
    494         "url": "https://news.ycombinator.com/item?id=38422264",
    495         "created_at": "2023-11-26T15:22:00Z"
    496       },
    497       {
    498         "hn_id": "46665309",
    499         "title": "Reverse Engineering the ESP32-C3 Wi-Fi Drivers for Static Worst-Case Analysis",
    500         "points": 8,
    501         "comments": 0,
    502         "url": "https://news.ycombinator.com/item?id=46665309",
    503         "created_at": "2026-01-18T06:27:12Z"
    504       },
    505       {
    506         "hn_id": "33745326",
    507         "title": "Program Repair",
    508         "points": 5,
    509         "comments": 0,
    510         "url": "https://news.ycombinator.com/item?id=33745326",
    511         "created_at": "2022-11-25T18:26:49Z"
    512       },
    513       {
    514         "hn_id": "42911811",
    515         "title": "Preserving Culinary Traditions. A Crowdsourced Digital Collection of Cookbooks",
    516         "points": 3,
    517         "comments": 0,
    518         "url": "https://news.ycombinator.com/item?id=42911811",
    519         "created_at": "2025-02-02T21:04:34Z"
    520       },
    521       {
    522         "hn_id": "38391666",
    523         "title": "Prompting Frameworks for Large Language Models: A Survey",
    524         "points": 2,
    525         "comments": 0,
    526         "url": "https://news.ycombinator.com/item?id=38391666",
    527         "created_at": "2023-11-23T11:28:55Z"
    528       },
    529       {
    530         "hn_id": "42204850",
    531         "title": "SEFD: Semantic-Enhanced Framework for Detecting LLM-Generated Text",
    532         "points": 1,
    533         "comments": 0,
    534         "url": "https://news.ycombinator.com/item?id=42204850",
    535         "created_at": "2024-11-21T14:54:19Z"
    536       },
    537       {
    538         "hn_id": "38473609",
    539         "title": "AviationGPT: A Large Language Model for the Aviation Domain",
    540         "points": 1,
    541         "comments": 0,
    542         "url": "https://news.ycombinator.com/item?id=38473609",
    543         "created_at": "2023-11-30T14:00:57Z"
    544       },
    545       {
    546         "hn_id": "38388226",
    547         "title": "Prompting Frameworks for Large Language Models: A Survey",
    548         "points": 1,
    549         "comments": 0,
    550         "url": "https://news.ycombinator.com/item?id=38388226",
    551         "created_at": "2023-11-23T01:55:17Z"
    552       }
    553     ],
    554     "top_points": 25,
    555     "total_points": 71,
    556     "total_comments": 10
    557   }
    558 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs