ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27594B)


      1 {
      2   "paper": {
      3     "title": "KORMo: Korean Open Reasoning Model for Everyone",
      4     "authors": ["Minjun Kim", "Hyeonseok Lim", "Hangyeol Yoo", "Inho Won", "Seungwoo Song", "Minkyung Cho", "Junghun Yuk", "Changsu Choi", "Dongjae Shin", "Huije Lee", "Hoyun Song", "Alice Oh", "KyungTae Lim"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2510.09426",
      8     "doi": "10.48550/arXiv.2510.09426"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "KORMo-10B is a 10.8B-parameter fully open bilingual Korean-English LLM trained from scratch with 68.7% synthetic Korean data. The paper demonstrates that synthetic data does not cause model collapse when diversity is maintained across synthesizers, but using a single synthesizer leads to severe degradation. The model achieves competitive performance against open-weight multilingual baselines (64.2 English avg, 58.2 Korean avg) with only 2.9T training tokens, and scores 8.61 on instruction-following benchmarks without reinforcement learning.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The abstract states 'All model checkpoints, datasets, and source codes are publicly available at huggingface.co/kormo-lm.' The paper emphasizes full FOM (Fully Open Model) release throughout."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper explicitly states all datasets are released at huggingface.co/kormo-lm. Table 10 references multiple publicly available datasets and states underlined datasets are 'generated synthetic data produced in-house' which are released."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions NVIDIA H200 GPUs, 128 GPUs, FSDP, bfloat16, and flash-attention-3, but does not provide a requirements.txt, Dockerfile, or detailed library versions for environment reproduction."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "While training configurations are detailed in Tables 1 and 13, the paper does not provide step-by-step reproduction instructions, README with commands, or scripts to replicate experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All benchmark results (Tables 2-4, 7-8, 11, 16-18, 21-22) report only point estimates with no confidence intervals, error bars, or ± notation."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper makes many comparative claims (e.g., 'Pre-LN consistently outperforms MixLN', 'KORMo achieved the highest average score') but no statistical significance tests are performed."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports percentage point differences with baseline context throughout, e.g., '+4.45 pt (60.15→64.60)' in Section 5.2.1, '+6.88pt increase in average score (43.43→50.31)' for Korean benchmarks."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification is given for the number of benchmarks selected, the 60B token proxy training budget, or why specific evaluation splits were chosen."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or multi-run results are reported. All experiments appear to be single-run with no spread measures."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Table 21 compares KORMo against SmolLM3-3B, OLMo2-7B/13B, KANANA-8B, Qwen3-8B, LLaMA3.1-8B, Gemma3-4B/12B. Table 22 compares instruction-tuned models similarly."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include very recent models: Qwen3-8B (2025), SmolLM3-3B (2025), Gemma3-12B (2025), OLMo2-13B (2025), KANANA-1.5 (2025)."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Extensive ablations across Sections 2-4: normalization methods (Table 2), attention masking (Table 3), NTP vs MTP (Table 4), tokenizer variants (Tables 6-8), deduplication strategies (Table 11), quality filtering versions (Table 12), and synthetic data diversity (Figure 8)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Over 26 benchmarks used across English and Korean, covering reasoning, knowledge, domain-specific, and instruction-following tasks (Table 20)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation of model outputs. The MT-Bench, KO-MT-Bench, and Logickor evaluations use GPT-4o as an automated judge, not human raters."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper uses established external benchmarks (MMLU, ARC, HellaSwag, KMMLU, etc.) that are separate from training data. Proxy experiments use different data splits than final model training."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down per benchmark across English general reasoning, English knowledge, Korean general reasoning, and Korean knowledge categories (Tables 16-18, 21-22)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 4.3 and Figure 8 explicitly show and analyze a failed training case with single-synthesizer data. Section 7.2 discusses weaknesses on KOBALT, MMLU-Pro, and KMMLU. Korean NIAH degradation beyond 13K tokens is analyzed."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "MTP underperforms NTP at 1B scale (Table 4). MixLN underperforms Pre-LN (Table 2). 196K vocabulary underperforms 125K on downstream tasks. Single-synthesizer data causes model collapse (Figure 8). Korean NIAH degrades beyond 13K tokens."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims about synthetic data stability (supported by Tables 2-3, Figure 8), comparable performance to baselines (supported by Table 21), and bilingual instruction tuning (supported by Table 22) are all backed by experimental results."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims are primarily from controlled ablations: synthetic vs non-synthetic under same conditions (Tables 2-3), normalization methods (Table 2), attention masking (Table 3), single vs multi-synthesizer (Figure 8). These are controlled single-variable manipulations."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The abstract claims to present 'a transparent framework for developing synthetic data-driven fully open models in low-resource settings' but only tests Korean-English. Section 2.2 acknowledges proxy-stage limitations but the broader framing of 'non-English FOM' from a single language pair is overbroad."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 2.4 discusses why MTP underperforms (model capacity, data scale). Section 4.3 analyzes causes of model collapse (single-model bias, prompt uniformity). Section 5.1 discusses Korean NIAH degradation factors (tokenizer instability, data imbalance, domain bias)."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures specific benchmark scores and reports them as such. Claims are framed in terms of benchmark performance rather than broader capabilities. The proxy model approach is explicitly acknowledged as a proxy (Section 2.1: 'we interpret the performance trends observed at the proxy stage with a focus on relative comparison')."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Specific model versions used for synthesis are stated: Qwen3-30B-A3B, Qwen3-235B-A22B, Qwen3-Next-80B-A3B-Instruct, GPT-oss (120B), Mixtral-8x7B-Instruct-v0.1, Mistral-Nemo-12B-Instruct, DeepSeek-R1, QwQ-32B (Tables 10, 15, 19)."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper describes prompt strategies in natural language (e.g., 'appending the instruction \"answer in Korean\"', 'custom-designed prompts') but does not provide the actual full prompt text used for synthetic data generation. Only Figure 11 shows a simple MMLU evaluation prompt."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Tables 1 and 13 provide detailed model architecture configs. Section 4.2.1 documents learning rate search. Training details include batch size 1024, sequence length 4096, weight decay policy, warm-up schedule, RoPE theta, RMSNorm epsilon, etc."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. This is a standard LLM pretraining and fine-tuning paper."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 4.1.3 provides detailed 3-stage filtering pipeline: 8 heuristic filters with specific thresholds, Bloom Filter deduplication with cross-corpus details, and quality filtering with classifier training procedure. Token counts at each stage are provided in Tables 10-12."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations section. Some limitations are mentioned inline (e.g., Section 2.2 on proxy-scale caveats, Section 7.2 on weaknesses) but no consolidated discussion."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed. While weaknesses are noted in results discussion, there is no systematic consideration of threats to validity of the experimental methodology."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 7.2 explicitly identifies weaknesses: 'relative weaknesses on (a) high-difficulty specialist knowledge benchmarks (e.g., MMLU-Pro, the KMMLU family) and (b) lexical semantic discrimination tasks (e.g., KOBALT).' Section 2.2 acknowledges proxy-stage results may not transfer to larger scales."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The paper states all datasets are released at huggingface.co/kormo-lm, including synthetic data, training data, and checkpoints. Table 10 lists all data sources with links."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4.1 describes data collection in detail: public English sources (Section 4.1.1), Korean public data from three sources (Korean Opensource, Ko-Web, Ko-CC-Dump), and synthetic generation procedures (Section 4.1.2)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data sources are standard public corpora and synthetic generation."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 4.1.3 documents the full pipeline: heuristic filtering (8 steps with thresholds), deduplication (BFF with cross-corpus sequence), quality filtering (4 classifier versions with composition details in Table 12), with filtering ratios discussed (e.g., Old-both removed ~70% of Korean corpus)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgments section states: 'This work was supported by Institute of Information & communications Technology Planning & Evaluation (IITP) grant funded by the Korea government(MSIT) (RS-2025-02653113).'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: KAIST MLP Lab, KAIST NLPCL Lab, KAIST U&I Lab, and SeoulTech. Special thanks acknowledge Trillion Labs and LG U+ contributions."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "IITP is a Korean government research funding agency with no commercial stake in the model's benchmark performance."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is provided. The acknowledgments mention Trillion Labs and LG U+ contributions, but no formal financial interests declaration exists."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The Ko-CC-Dump uses WARC files from March-May 2025, providing some temporal information, but no explicit training data cutoff date is stated for the overall model or the public datasets used."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether benchmark examples (MMLU, ARC, HellaSwag, etc.) appear in the training data (DCLM, UltraFineWeb, Nemotron-CC, etc.)."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The paper uses well-known benchmarks (MMLU from 2020, ARC from 2018, HellaSwag from 2019) with training data that could contain them, but contamination is not discussed. The only mention is using KR-Clinical-QA to 'mitigate potential data contamination' (Section 7.1) but no systematic analysis."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost, latency, or tokens-per-second metrics are reported for the final model."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "The paper states 128 NVIDIA H200 GPUs were used but does not report total GPU hours, training time, or compute cost for any training stage."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No multi-seed results are reported. All experiments appear to be single runs."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is never stated. Results appear to be from single runs without explicit confirmation."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "Section 4.2.1 reports learning rate search over 8 candidates {1e-4, 3e-4, 5e-4, 7e-4, 9e-4, 1e-3, 1.5e-3, 3e-3} with 2000 steps each. Figure 6 shows all loss curves."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Section 4.2.1 justifies selection of 7e-4 learning rate based on lowest loss across all steps (Figure 6). Tokenizer selection in Section 3.4 is justified by downstream performance comparison."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No statistical tests are performed, let alone corrections for multiple comparisons across the many benchmark comparisons."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "No acknowledgment of author-evaluation bias. The authors evaluate their own model against baselines without discussing this potential bias."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Section 7.2 explicitly discusses token budget vs performance: 'Given its relatively modest pretraining corpus of 2.9T tokens, these results highlight KORMo-10B's high language modeling efficiency.' Table 21 shows training token counts for all models compared."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether the benchmarks actually measure the claimed capabilities. The paper uses many benchmarks without questioning their construct validity."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved in the evaluation. Models are directly evaluated on benchmarks."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of temporal leakage. Many benchmarks (MMLU 2020, ARC 2018, HellaSwag 2019) predate the training data which includes 2025 web crawls."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether evaluation setups leak information. The few-shot evaluation prompts could theoretically provide hints not available in real usage."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether training corpora (DCLM, UltraFineWeb, Nemotron-CC) share overlap with evaluation benchmarks."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection method is applied. The only mention is using the recently released KR-Clinical-QA to 'mitigate potential data contamination' (Section 7.1), but no decontamination pipeline or detection method is used."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Synthetic data does not cause instability or degradation during large-scale pretraining when carefully curated with balanced linguistic coverage and diverse instruction styles.",
    365       "evidence": "Tables 2-3 show synthetic data outperforms or matches web data under same conditions. However, Figure 8 shows single-synthesizer data causes model collapse. Evidence is from 1B proxy model and 10.8B model (Figures 7-8).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "KORMo-10B achieves performance comparable to contemporary open-weight multilingual baselines.",
    370       "evidence": "Table 21 shows 64.2 English avg vs Qwen3-8B 68.55, Gemma3-12B 67.72; Korean avg 58.15 vs KANANA-8B 60.94, Qwen3-8B 63.35. Competitive but not matching the best baselines.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Bilingual instruction tuning enables near-native reasoning and discourse coherence in Korean.",
    375       "evidence": "Table 22 shows KO-MT-Bench score of 8.54 and Logickor 8.96, outperforming KANANA (8.02, 8.94) and Qwen3 (8.16, 8.63) on Korean instruction-following. GPT-4o used as judge.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Using augmented data from a single synthesizer causes model collapse, but diverse multi-synthesizer data avoids this.",
    380       "evidence": "Figure 8 shows Stage 2 (Failure) with single Qwen3-30B-A3B synthesizer vs successful Stage 2 with diverse synthesizers. Clear degradation visible in both English and Korean performance curves.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Pre-LN consistently outperforms MixLN for normalization.",
    385       "evidence": "Table 2 shows Pre-LN average 43.38% vs MixLN 41.28% on 1B proxy model. Only one configuration comparison with no variance or significance testing.",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "Intra-document masking is the most effective attention masking strategy.",
    390       "evidence": "Table 3 shows Intra-doc 44.48% vs Causal 43.38%, Sliding causal 43.33%, Sliding Intra-doc 43.59%. Differences are small with no statistical testing.",
    391       "supported": "weak"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No variance or significance testing across all experiments",
    397       "detail": "Despite numerous comparative claims across Tables 2-4, 7-8, 11, 16-18, 21-22, all results are single-run point estimates. Differences of 1-2 percentage points are treated as meaningful without any statistical validation. The ablation differences (e.g., 43.38% vs 41.28% for normalization) could easily be within run-to-run variance."
    398     },
    399     {
    400       "flag": "No contamination analysis despite using well-known benchmarks",
    401       "detail": "The model is trained on web-crawled data (including 2025 Common Crawl dumps) and evaluated on benchmarks from 2018-2020 (ARC 2018, HellaSwag 2019, MMLU 2020). No decontamination pipeline or overlap analysis is performed. The brief mention of using KR-Clinical-QA to 'mitigate potential data contamination' (Section 7.1) is insufficient."
    402     },
    403     {
    404       "flag": "No compute budget reported",
    405       "detail": "128 H200 GPUs were used to train on 2.9T tokens, but total GPU hours, wall-clock time, and training cost are never stated. For a paper about training efficiency and FOM accessibility, this is a notable omission."
    406     },
    407     {
    408       "flag": "LLM-as-judge evaluation without validation",
    409       "detail": "MT-Bench, KO-MT-Bench, and Logickor evaluations all use GPT-4o as the sole judge. No inter-rater reliability, no human validation of the judge, and no discussion of potential biases in GPT-4o judging a model partly trained on synthetic data from Qwen models."
    410     },
    411     {
    412       "flag": "Overclaiming from proxy experiments",
    413       "detail": "Key design decisions (normalization, attention masking, MTP vs NTP) are made from 1B proxy models trained on 60B tokens, then applied to 10.8B model trained on 2.9T tokens. The paper acknowledges this limitation for some results but still makes broad claims about the findings."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "2 OLMo 2 Furious",
    419       "authors": ["Pete Walsh", "Luca Soldaini", "Dirk Groeneveld"],
    420       "year": 2025,
    421       "arxiv_id": "2501.00656",
    422       "relevance": "Key fully open model baseline; establishes FOM practices for reproducible LLM training."
    423     },
    424     {
    425       "title": "The Llama 3 Herd of Models",
    426       "authors": ["Aaron Grattafiori"],
    427       "year": 2024,
    428       "arxiv_id": "2407.21783",
    429       "relevance": "Major open-weight model family used as architecture reference and baseline."
    430     },
    431     {
    432       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    433       "authors": ["DeepSeek-AI"],
    434       "year": 2025,
    435       "arxiv_id": "2501.12948",
    436       "relevance": "Reasoning model whose data generation approach influenced KORMo's reasoning data construction."
    437     },
    438     {
    439       "title": "Qwen3 Technical Report",
    440       "authors": ["Qwen Team"],
    441       "year": 2025,
    442       "arxiv_id": "2505.09388",
    443       "relevance": "Primary synthesizer model family used for generating KORMo's Korean synthetic data."
    444     },
    445     {
    446       "title": "SmolLM3: smol, multilingual, long-context reasoner",
    447       "authors": ["Elie Bakouch", "Loubna Ben Allal"],
    448       "year": 2025,
    449       "relevance": "Fully open model baseline; APO/preference learning strategy adopted from this work."
    450     },
    451     {
    452       "title": "The curse of recursion: Training on generated data makes models forget",
    453       "authors": ["Ilia Shumailov"],
    454       "year": 2023,
    455       "relevance": "Core concern addressed by KORMo: whether synthetic data causes model collapse in LLM pretraining."
    456     },
    457     {
    458       "title": "Textbooks Are All You Need",
    459       "authors": ["Suriya Gunasekar"],
    460       "year": 2023,
    461       "relevance": "Pioneering work on synthetic textbook-style data for LLM training, foundational to KORMo's approach."
    462     },
    463     {
    464       "title": "Nemotron-CC: Transforming Common Crawl into a Refined Long-Horizon Pretraining Dataset",
    465       "authors": ["Dan Su"],
    466       "year": 2025,
    467       "arxiv_id": "2412.02595",
    468       "relevance": "Major data source for KORMo's pretraining; provides synthetic and web data at scale."
    469     },
    470     {
    471       "title": "Nemotron-4 340B Technical Report",
    472       "authors": ["Nvidia"],
    473       "year": 2024,
    474       "arxiv_id": "2406.11704",
    475       "relevance": "Industrial-scale synthetic data pipeline that informed KORMo's data augmentation approach."
    476     },
    477     {
    478       "title": "BLOOM: A 176B-Parameter Open-Access Multilingual Language Model",
    479       "authors": ["BigScience Workshop"],
    480       "year": 2023,
    481       "arxiv_id": "2211.05100",
    482       "relevance": "Prominent multilingual FOM predecessor; demonstrates community-driven open model development."
    483     },
    484     {
    485       "title": "DataComp-LM: In Search of the Next Generation of Training Sets for Language Models",
    486       "authors": ["Jeffrey Li"],
    487       "year": 2024,
    488       "relevance": "Major English pretraining data source and filtering methodology adopted by KORMo."
    489     },
    490     {
    491       "title": "Measuring Massive Multitask Language Understanding",
    492       "authors": ["Dan Hendrycks"],
    493       "year": 2020,
    494       "relevance": "MMLU benchmark used as primary English knowledge evaluation in KORMo."
    495     }
    496   ]
    497 }

Impressum · Datenschutz