ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (26871B)


      1 {
      2   "paper": {
      3     "title": "Anatomy of Capability Emergence: Scale-Invariant Representation Collapse and Top-Down Reorganization in Neural Networks",
      4     "authors": ["Jayadev Billa"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.15997"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper. The paper describes detailed experimental setups but provides no code release."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The algorithmic tasks use on-the-fly generated data and the Pythia models are public, but no dataset, checkpoint data, or computed geometric measures are released. No download link or data repository is mentioned."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions using an NVIDIA RTX 3090 Ti (Appendix I) and describes model architectures, but provides no requirements.txt, Dockerfile, or library version listing sufficient to recreate the environment."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While the paper describes methods in detail (training hyperparameters in Table 8, checkpoint schedule in A.3, geometric measure formulations in Section 3.2), there are no step-by-step reproduction instructions, README, or runnable scripts provided."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Bootstrap 95% confidence intervals are reported for concordance rates in Section 4.6 (e.g., 'cross-class concordance: 93% (65/70 pairs; 95% bootstrap CI: [86%, 99%])') and Figure 5."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The swap test confidence interval excludes 50% (Section 4.6: 'CI excludes 50%, confirming this is significantly below chance'), functioning as a significance assessment. Bootstrap CIs are used throughout the concordance analyses."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Effect sizes are reported with context throughout: collapse floor CV values (MOD: CV=0.08), RANKME init-to-floor ratios (4-9x), Spearman correlations (rho=0.57-0.90), precursor rates (100% for hard tasks), and magnitude of delays in freeze experiments (+14,000 steps, 1.72x)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper explicitly discusses effective sample size: 'the effective number of independent observations is closer to 40 (8 tasks x 5 scales), since difficulty levels provide correlated within-task replicates' (Section 4.1). The small-N limitation of within-class analyses is also acknowledged (Section 4.6: 'The wide CI reflects the small sample size')."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Standard deviations and coefficients of variation are reported for collapse floors (e.g., MOD: 2.12 +/- 0.17, CV=0.08 in Section 4.2). LLC estimator stability is verified with CV < 0.05 across 5 runs (Appendix O). Bootstrap CIs quantify spread for concordance metrics."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares five geometric measures head-to-head (RANKME, Fisher effective rank, LLC, Hessian top-lambda, gradient covariance rank) in Table 7 and Section 4.5, providing systematic baselines for which measure best predicts emergence."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The compared measures include LLC from Lau et al. (2025), RANKME from Garrido et al. (2023), and Hessian analysis from Ghorbani et al. (2019). LLC and the developmental landscape work (Hoogland et al. 2024) are contemporary."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The freeze experiments (Section 5, Tables 14-15) serve as ablation/intervention studies, testing the causal role of specific layers during the collapse window. Additionally, robustness analyses in Appendix L vary emergence thresholds and window widths, and Appendix D tests 9 denoising strategies."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Five geometric measures are tracked (RANKME, Fisher effective rank, LLC, Hessian top-eigenvalue, gradient covariance rank). Behavioral metrics include accuracy, log-probability, and probe accuracy. Multiple concordance and correlation measures are used for evaluation."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is a computational study of neural network training dynamics with no human-generated outputs to evaluate. Human evaluation is not relevant to the claims."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper uses a 'fixed test set of 1,000 examples each (24,000 total)' for evaluation at every checkpoint (Section 3.1). Probing uses '80/20 stratified train/test splits' (Section 3.3)."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by task (8 tasks), difficulty level (3 levels), model size (5 sizes), and difficulty class (easy vs. hard). Tables 5, 7, 9, 13 provide extensive per-category breakdowns."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Failure cases are discussed extensively: the Pythia negative result (Section 4.7), the swap test failure at 26% (Section 4.6), within-easy concordance at chance level (52%), inconclusive freeze results at small scale (Section 5), and the 4 exceptions to top-down propagation."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Multiple negative results are reported prominently: LLC shows 0/24 precursor rate (Section 4.5), geometric prediction fails within difficulty classes (Section 4.6), task-specific precursor signals do not transfer to Pythia (Section 4.7), Fisher-LLC proxy fails (rho=0.32), and freeze experiments are inconclusive at small scale."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "All abstract claims are supported: the 119/120 emergence events (Section 4.1), scale-invariant collapse floors with CV=0.08 (Section 4.2), 28/32 top-down consistency (Section 4.3), 100% precursor rate for hard tasks (Section 4.5), 0/24 LLC precursor (Section 4.5), within-class concordance ranges (Section 4.6), and the Pythia boundary condition (Section 4.7)."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper is careful about causal language. Temporal precedence claims are based on cross-correlation analysis. The one explicit causal test (freeze experiment, Section 5) is honestly described as 'suggestive but not definitive' with the nano vs. small discrepancy reported. The paper states 'We establish temporal precedence; the freeze experiment provides suggestive directional evidence but not definitive causation.'"
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Generalization is explicitly bounded: 'Our contribution is the geometric anatomy of emergence and its boundary conditions, not a prediction tool' (abstract). Limitations include task scope (8 algorithmic tasks simpler than natural language), model scale (85M max), correlational evidence, and Pythia negative results. The abstract itself states what does NOT transfer."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Alternative explanations are discussed substantively: for top-down propagation, gradient proximity is proposed as a mechanism (Section 5); for the nano freeze result, confounding with 'freezing half the model' is discussed (Section 5); for recovery-emergence correlation, 'shared dependence on task difficulty rather than a causal relationship' is stated (Section 4.2); and for scale-invariant floors, the alpha_req artifact is exposed (Appendix N)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Custom algorithmic models are fully specified with architecture details (Table 2). Pythia models are identified by exact name: 'Pythia-deduped models: 160M, 410M, 2.8B' (Section 3.4), which are specific publicly released model versions from Biderman et al. (2023)."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The task format is fully specified: 'TASK input = output' with character-level tokenization (Section 3.1, Appendix A.1). Pythia diagnostic prompts are described with examples in Table 4 (e.g., 'The keys to the cabinet is/are', 'man : woman :: king : ___'). The exact format for each of the 8 algorithmic tasks is given in Table 1 and Appendix A.1."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Full hyperparameters are reported in Table 8 (Appendix A.2): AdamW with beta1=0.9, beta2=0.95, peak LR, warmup steps, batch size, weight decay, gradient clipping. LLC hyperparameters are in Appendix O: n_steps=500, eta=10^-5, beta=1.0, gamma=10000."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. This is a study of neural network training dynamics using standard training and evaluation pipelines."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Data generation is fully documented in Appendix A.1 with exact specifications for each task and difficulty level. Training data is generated on-the-fly with uniform sampling. Diagnostic sets for Pythia are described in Table 4 with sample sizes. The checkpoint schedule is precisely specified in Section 3.1 and Appendix A.3."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "A dedicated 'Limitations' subsection in the Discussion (Section 5) lists six specific limitations with numbered items covering task scope, model scale, correlational evidence, task count, gradient covariance bias, and Pythia diagnostic set size."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Threats are highly specific: '8 algorithmic tasks, while spanning a genuine complexity hierarchy, are simpler than natural language tasks' (Limitation 1); 'Our largest model (85M parameters) is small by current standards' (Limitation 2); 'the effective number of independent observations is ~40' (Limitation 4); gradient covariance is 'biased toward embedding weights and early layers' (Limitation 5)."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Explicit scope boundaries are stated: 'Our contribution is the comprehensive geometric anatomy... together with honest reporting of where geometric prediction succeeds and where it fails. We provide the map, not the destination' (Section 1). Section 4.6 and 4.7 are entirely devoted to where the approach fails. Falsifiable predictions in Section 5 explicitly state what would refute the findings."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data (checkpoint activations, geometric measures, emergence timing data) is released or made available for download. Only aggregated statistics are reported in the paper."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Data collection is described in detail: algorithmic data is generated on-the-fly with specified distributions (Section 3.1, Appendix A.1), Pythia checkpoints are from Biderman et al. (2023), diagnostic sets are described with construction methodology and sizes in Table 4."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved, so recruitment is not applicable. Data is generated algorithmically or taken from public model checkpoints (Pythia)."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from training through checkpointing, evaluation, geometric measure computation, and analysis is documented across Sections 3.1-3.5. The checkpoint schedule, evaluation procedure (1,000 examples per task per checkpoint), and each geometric measure's computation are specified."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding information or acknowledgments section is present in the paper. The author is listed as an 'Unaffiliated researcher' with a footnote noting previous affiliations (ISI@USC, Yahoo, Nuance, BBN), but no funding disclosure is provided."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The author's affiliation is disclosed as 'Unaffiliated researcher' with previous affiliations listed in a footnote: 'previously at ISI@USC, Yahoo, Nuance, BBN.' The paper does not evaluate any commercial product, so no product-affiliation conflict arises."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "The author appears to be an unaffiliated independent researcher with no disclosed funding source. This is plausibly unfunded personal research, making the question not applicable."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement or financial disclosure is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark in the contamination-relevant sense. The algorithmic models are trained from scratch on synthetic data, and the Pythia analysis uses the models' own training checkpoints to study geometric dynamics rather than benchmark performance."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not applicable. The algorithmic experiment uses on-the-fly generated synthetic data for both training and evaluation. The Pythia diagnostic sets are external probes explicitly described as 'not necessarily aligned with Pythia's training distribution' (Section 3.4). Contamination in the conventional benchmark sense is not a concern."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not applicable. The study trains models from scratch on synthetic data (algorithmic tasks) or analyzes training dynamics of existing models (Pythia). No benchmark contamination risk exists for the claims being made."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Per-checkpoint computation costs are reported in Table 3 (e.g., RANKME ~1s, Fisher ~85s, LLC ~11s, Hessian ~30s) and the paper explicitly notes that 'costs scale roughly linearly with parameter count.'"
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "A detailed compute budget breakdown is provided in Table 11 (Appendix I): ~209 total GPU-hours on a single NVIDIA RTX 3090 Ti, broken down by component (training ~50h, Fisher ~35h, Hessian ~40h, etc.)."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Training begins with a universal representation collapse to task-specific floors that are scale-invariant across a 210x parameter range (e.g., modular arithmetic collapses to RANKME ~2.0 regardless of model size, CV=0.08).",
    286       "evidence": "Section 4.2: MOD collapse floor 2.12 +/- 0.17 (CV=0.08) across five model sizes (405K-85M). ADD: 3.93 +/- 0.70 (CV=0.18). MUL: 5.25 +/- 2.18 (CV=0.42). Figure 2 shows trajectories.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Representation collapse propagates top-down through layers, with output-facing layers collapsing most, contradicting bottom-up feature-building intuition.",
    291       "evidence": "Section 4.3: 28/32 task x model combinations show top-down gradient. First-to-last-layer RANKME reduction ranges 30-84%. Quantitative examples: MOD micro layer 0 = 8.2 vs layer 3 = 1.7 (80% reduction).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "RANKME (representation geometry) leads behavioral emergence at 100% precursor rate for hard tasks across all five model sizes, while LLC is synchronous (0/24 precursor) and Hessian measures lag.",
    296       "evidence": "Section 4.5, Table 7: RANKME 75% overall (micro), 100% for hard tasks at all scales (Table 13). LLC 0/24 precursor. Hessian 17% precursor. Cross-correlation analysis on first-differenced time series.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Geometric measures encode coarse task difficulty but do not achieve fine-grained prediction of emergence timing.",
    301       "evidence": "Section 4.6: Cross-class concordance 93%, but within-easy 52% (chance level), within-hard 69%, swap test 26% (below chance). Bootstrap 95% CIs provided for all rates.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "On Pythia language models, global geometric patterns replicate but per-task precursor signals do not transfer, because the precursor relationship requires task-training alignment absent in naturalistic pre-training.",
    306       "evidence": "Section 4.7: Collapse replicates (50-90% RANKME drop), RANKME ordering preserved (rho=1.0 between 160M and 410M), but task-specific RANKME does not consistently precede behavioral emergence. No differential signal around the arithmetic phase change at step ~65K in 2.8B.",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "Linear probes show hidden learning is universal: representations encode correct answers before behavioral emergence, and this learning concentrates in deeper layers.",
    311       "evidence": "Section 4.4, Table 6: All 8 tasks across all 5 model sizes show probe accuracy exceeding behavioral accuracy before emergence. Deep-to-shallow ratio 2-11x for hidden learning magnitude.",
    312       "supported": "strong"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval", "theoretical"],
    316   "key_findings": "The paper establishes a geometric anatomy of capability emergence in neural networks through controlled experiments with 120 task/level/model combinations. Key findings include scale-invariant representation collapse floors (MOD collapses to RANKME ~2.0 regardless of model size), top-down layer propagation of collapse (28/32 consistency), and a temporal hierarchy where representation geometry (RANKME) leads emergence for hard tasks (100% precursor rate) while loss landscape measures are synchronous or lagging. Critically, the paper honestly delineates where prediction fails: within-difficulty-class concordance drops to chance levels, and per-task precursor signals do not transfer to naturalistic pre-training (Pythia).",
    317   "red_flags": [
    318     {
    319       "flag": "No code or data release",
    320       "detail": "Despite the paper's emphasis on reproducibility and detailed method descriptions, no code, data, or computed geometric measures are released. The experiments involve substantial computation (~209 GPU-hours) that others would need to fully replicate."
    321     },
    322     {
    323       "flag": "Small model scale",
    324       "detail": "The largest custom model is 85M parameters, and the largest Pythia model analyzed is 2.8B with only 17 targeted checkpoints. Whether the findings transfer to frontier-scale models (>7B) is unknown, which limits the relevance of the safety-monitoring claims made in the paper's conclusion."
    325     },
    326     {
    327       "flag": "Single author with no funding disclosure",
    328       "detail": "The paper has a single unaffiliated author and contains no funding or competing-interests statement. This is not inherently problematic, but the absence of any disclosure statement is noted."
    329     }
    330   ],
    331   "cited_papers": [
    332     {
    333       "title": "Emergent abilities of large language models",
    334       "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani", "Colin Raffel", "Barret Zoph", "Sebastian Borgeaud", "Dani Yogatama", "Maarten Bosma", "Denny Zhou", "Donald Metzler", "Ed H. Chi", "Tatsunori Hashimoto", "Oriol Vinyals", "Percy Liang", "Jeff Dean", "William Fedus"],
    335       "year": 2022,
    336       "relevance": "Foundational paper identifying emergent abilities in LLMs as sudden capability transitions with scale, which this paper mechanistically investigates."
    337     },
    338     {
    339       "title": "Are emergent abilities of large language models a mirage?",
    340       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    341       "year": 2023,
    342       "relevance": "Argues apparent emergence is an artifact of nonlinear evaluation metrics, motivating this paper's use of continuous log-probability alongside accuracy."
    343     },
    344     {
    345       "title": "Pythia: A suite for analyzing large language models across training and scaling",
    346       "authors": ["Stella Biderman", "Hailey Schoelkopf", "Quentin Gregory Anthony"],
    347       "year": 2023,
    348       "relevance": "Provides the publicly available model checkpoints used for the Pythia generalization probe in this paper."
    349     },
    350     {
    351       "title": "Progress measures for grokking via mechanistic interpretability",
    352       "authors": ["Neel Nanda", "Lawrence Chan", "Tom Lieberum", "Jess Smith", "Jacob Steinhardt"],
    353       "year": 2023,
    354       "relevance": "Identifies interpretable progress measures tracking grokking in modular arithmetic, directly related to this paper's modular arithmetic task analysis."
    355     },
    356     {
    357       "title": "The developmental landscape of in-context learning",
    358       "authors": ["Jesse Hoogland", "George Wang", "Matthew Farrugia-Roberts", "Liam Carroll", "Susan Wei", "Daniel Murfet"],
    359       "year": 2024,
    360       "arxiv_id": "2402.02364",
    361       "relevance": "Applies LLC to detect developmental stages in GPT-2 training; this paper's finding that LLC is synchronous (not precursor) in multi-task settings provides an important boundary condition."
    362     },
    363     {
    364       "title": "Tracing the representation geometry of language models from pretraining to post-training",
    365       "authors": ["Melody Zixuan Li", "Kumar Krishna Agrawal", "Arna Ghosh", "Komal Kumar Teru", "Adam Santoro", "Guillaume Lajoie", "Blake A. Richards"],
    366       "year": 2025,
    367       "arxiv_id": "2509.23024",
    368       "relevance": "Tracks eigenspectrum of hidden-state covariance matrices in Pythia training and identifies three developmental phases; this paper extends the analysis to test whether geometric changes predict capability emergence."
    369     },
    370     {
    371       "title": "The local learning coefficient: A singularity-aware complexity measure",
    372       "authors": ["Edmund Lau", "Zach Furman", "George Wang", "Daniel Murfet", "Susan Wei"],
    373       "year": 2025,
    374       "relevance": "Develops the LLC estimator used in this paper's geometric hierarchy analysis; this paper finds LLC is synchronous with emergence in multi-task settings."
    375     },
    376     {
    377       "title": "Neural networks learn statistics of increasing complexity",
    378       "authors": ["Nora Belrose", "Quintin Pope", "Lucia Quirke", "Alex Mallen", "Xiaoli Z. Fern"],
    379       "year": 2024,
    380       "relevance": "Demonstrates progressive complexity learning during training, aligning with this paper's finding of representation diversification post-collapse."
    381     },
    382     {
    383       "title": "Understanding emergent abilities of language models from the loss perspective",
    384       "authors": ["Zhengxiao Du", "Aohan Zeng", "Yuxiao Dong", "Jie Tang"],
    385       "year": 2024,
    386       "arxiv_id": "2403.15796",
    387       "relevance": "Prior demonstration of emergence mirage through loss-based reanalysis; this paper provides a complementary within-training-run demonstration."
    388     },
    389     {
    390       "title": "Rankme: Assessing the downstream performance of pretrained self-supervised representations by their rank",
    391       "authors": ["Quentin Garrido", "Randall Balestriero", "Laurent Najman", "Yann LeCun"],
    392       "year": 2023,
    393       "relevance": "Introduces the RANKME metric (effective dimensionality via entropy of normalized singular values) that is the primary geometric measure in this paper."
    394     }
    395   ]
    396 }

Impressum · Datenschutz