scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28642B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "KORMo: Korean Open Reasoning Model for Everyone",
      6     "authors": [
      7       "Minjun Kim",
      8       "HyeonSeok Lim",
      9       "Hangyeol Yoo",
     10       "Inho Won",
     11       "Seungwoo Song"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2510.09426",
     16     "doi": "10.48550/arXiv.2510.09426"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Key abstract claims (68.74% synthetic Korean data, no instability, comparable performance to multilingual baselines) are backed by Tables 2–4 proxy ablations and Tables 21–22 final model evaluations.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims (Pre-LN > MixLN, intra-doc masking > causal masking, NTP > MTP) are supported by controlled proxy ablation experiments varying one factor at a time with identical training budgets.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The conclusion states 'KORMo validates that synthetic data is not only viable but also scalable as a principal resource for non-English FOMs,' generalizing from a single Korean-English model to all non-English settings without sufficient supporting evidence.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Performance improvements from tokenizer design and synthetic data are attributed solely to the proposed choices without considering alternatives; e.g., tokenizer gains are attributed to mixture design rather than domain familiarity.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The abstract claims 'near-native reasoning and discourse coherence in Korean' while the evidence is standardized multiple-choice benchmark accuracy; this distinction between benchmark performance and actual discourse quality is never discussed.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations or threats-to-validity section; the paper's sections are Introduction, Training Design, Tokenizer, Pretraining, Mid-training, Post-training, Experiments, Conclusion.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Specific threats such as proxy-to-target scale transfer gaps and LLM-as-judge bias are not systematically discussed; brief mentions (e.g., 'these results were obtained from a 1B parameter proxy model') are scattered and not treated as threats.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Section 7.2 notes weaknesses on specialist knowledge and lexical discrimination tasks, but these are framed as future improvement targets rather than explicit boundaries on what the results do not show.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Funding is disclosed in the Acknowledgments: 'IITP grant funded by the Korea government(MSIT) (RS-2025-02653113, High-Performance Research AI Computing Infrastructure Support at the 2 PFLOPS Scale)'.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly stated on the title page: KAIST MLP Lab, KAIST NLPCL Lab, KAIST U&I Lab, and SeoulTech.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The Korean government (IITP) provides computational infrastructure support and has no direct stake in KORMo's benchmark performance or competitive outcomes.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "There is no competing interests statement or financial disclosure in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are explicitly defined in Section 1: 'Fully Open Model (FOM)' vs 'Open-Weight Model (OWM)' are contrasted with specifics on what each discloses; 'synthetic data' and 'model collapse' are used with definitions or citations.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three explicit contributions are enumerated in Section 1: (1) first systematic FOM in non-English with majority synthetic data, (2) practical tokenizer/training guidelines, (3) full release of 10.8B Korean-English model with all components.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper engages extensively with OLMo (FOM precedent), Nemotron-CC and Cosmopedia (synthetic data methods), Shumailov et al. (model collapse risk), and multilingual baselines (Qwen3, Gemma3, LLaMA3), showing how KORMo relates to and builds on each.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The abstract states 'All model checkpoints, datasets, and source codes are publicly available at huggingface.co/kormo-lm' and release is framed as a core contribution of the paper.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The paper claims all training data is released, including in-house synthetic datasets (Synth-FineWeb2, Kosmopedia, Ko-Reasoning), alongside model weights and training recipes.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper mentions H200 GPUs, FSDP, and FlashAttention-3 but provides no requirements.txt, Dockerfile, or equivalent environment specification file within the paper.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Training procedures are described in narrative detail across multiple sections but there are no step-by-step reproduction commands, scripts, or structured README within the paper itself.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All benchmark results across Tables 2–4, 7–8, 11, 16–18, 21–22 are reported as point estimates only without confidence intervals or error bars.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied to any comparative benchmark results; all conclusions rely on raw performance differences.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Absolute point improvements are consistently reported with baseline context (e.g., '+4.45 pt Stage1→Stage2', '+5.05-point gain', '+6.88pt average') throughout Section 5.2.1.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Proxy experiments use a 1B model on 60B tokens without justification for why this size and token count are sufficient to reliably predict 10.8B behavior.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or results across multiple training runs are reported; all results are single-run point estimates.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Tables 21–22 compare KORMo-10B against nine external models: SmolLM3-3B, OLMo2-7B, OLMo2-13B, KANANA1.5-8B, Qwen3-8B, LLaMA3.1-8B, Gemma3-4B, Gemma3-12B, and Exaone3.5-8B.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "All comparison baselines are 2024–2025 models representing the current state of the art in fully-open and open-weight multilingual LLMs.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Extensive ablations in Sections 2–3 compare normalization methods (Table 2), attention masking strategies (Table 3), training objectives (Table 4), tokenizer configurations (Tables 7–8), and deduplication strategies (Table 11).",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Over 26 benchmarks are used spanning general reasoning, knowledge, reading comprehension, commonsense, mathematics, Korean domain-specific tasks, and instruction-following (MT-Bench, Ko-MT-Bench, Logickor).",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "No human evaluation is conducted; instruction-following quality is assessed via GPT-4o as an automated judge rather than human raters.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "All evaluation benchmarks (MMLU, ARC, KoBEST, KMMLU, etc.) are evaluated on their standard held-out test splits; the paper does not verify that these splits are absent from the training corpora.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by language (English vs Korean), training stage (Stage1/Stage2/Midtrain-Long/Midtrain-Reason), and benchmark domain category in Tables 16–18.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "The 'Stage 2 (Failure)' experiment in Figure 8 explicitly demonstrates catastrophic model collapse from single-synthesizer synthetic data with quantitative comparison and causal analysis.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Multiple negative results are explicitly reported: MTP underperforms NTP at 1B scale (Table 4), MixLN underperforms Pre-LN (Table 2), 196K vocabulary reduces downstream performance vs 125K (Section 3.4).",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "KORMo's full architecture is specified in Tables 1 and 13; comparison baselines are named with specific version identifiers ('qwen3-8b', 'llama3.1-8b', 'gemma3-12b') in Table 21.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "Only the MMLU evaluation prompt is shown (Figure 11); Korean benchmark prompts and most evaluation prompts are referenced as 'standard evaluation prompts proposed by OLMo 2' without direct reproduction.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Extensive hyperparameter reporting throughout: learning rate (7e-4), global batch size (1024), sequence length (4096), warmup (0.03%), full architecture specs in Tables 1 and 13, optimizer initialization details.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "The paper is about LLM pretraining and standard benchmark evaluation; no agentic scaffolding is involved.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 4.1.3 provides highly detailed documentation of the three-stage filtering pipeline: heuristic filtering with 8 specific rules and thresholds, Bloom Filter deduplication, and quality classifier training with four dataset versions.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "The paper claims all components including training datasets are released at HuggingFace, with footnote URLs provided for individual datasets throughout.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 4.1 details data collection from web crawls (WARC parsing with fastText language filtering, threshold=0.8), public datasets, and synthetic generation with specific synthesizer models, seed pools, and prompt strategies.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; all evaluation uses standard benchmarks with no recruitment.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The complete pipeline from collection through heuristic filtering, BFF deduplication, fastText quality classification, and stage-specific curricula is documented with specific thresholds, algorithms, and dataset composition tables.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": true,
    296           "justification": "Training data temporal bounds are established: Ko-CC-Dump uses WARC dumps 2025-13 through 2025-21, corresponding to web crawls collected between March 15 and May 25, 2025.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "Contamination concern is addressed only by selecting KR-Clinical-QA as a newer benchmark; no systematic overlap analysis between training corpora and established benchmarks (MMLU, ARC, KMMLU) is performed.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "Standard benchmarks predating the training cutoff (MMLU, ARC, KMMLU, KoBEST) are not analyzed for contamination; only one benchmark (KR-Clinical-QA) is selected specifically to mitigate this risk.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference cost, latency, or throughput figures are reported for KORMo-10B in deployment.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "The paper mentions 128 H200 GPUs but does not report total GPU-hours, FLOPs, or dollar cost for the full 2.9T token training run.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Synthetic data comprising 68.74% of the Korean pretraining corpus does not cause training instability or performance degradation when sufficiently diverse",
    375       "evidence": "Tables 2–3 show synthetic data matches or exceeds non-synthetic performance at 1B proxy scale; Figure 8 shows that diversity is required (single-synthesizer Synth-Nemo-HQ causes catastrophic collapse)",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Pre-LN with RMSNorm outperforms MixLN for both training stability and downstream performance",
    380       "evidence": "Table 2: Pre-LN achieves 43.38% avg vs MixLN 41.28% avg across 9 benchmarks in controlled 1B proxy experiment",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Intra-document attention masking outperforms standard causal masking for pretraining",
    385       "evidence": "Table 3: Intra-doc achieves 44.48% avg vs causal 43.38% avg; synthetic data further improves to 45.66% under intra-doc",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "NTP outperforms MTP at 1B scale with limited data (60B tokens)",
    390       "evidence": "Table 4: NTP 43.38% avg vs MTP 41.35% avg; MTP shows selective gains on QA tasks but loses on precision tasks (ARC-E, PIQA, RACE)",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "KORMo-10B achieves performance comparable to larger multilingual open-weight models with only 2.9T training tokens",
    395       "evidence": "Table 21: English avg 64.2 (vs OLMo2-13B 65.3, trained on 5.5T); Korean avg 58.15 (vs KANANA1.5-8B 60.94 trained on 3.2T with more Korean data)",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "KORMo-10B achieves highest instruction-following average (8.61) among compared models including Gemma3-12B and Qwen3-8B",
    400       "evidence": "Table 22: KORMo-10B 8.61 avg on MT-Bench/Ko-MT-Bench/Logickor vs Gemma3-12B 8.56, Qwen3-8B 8.50, all evaluated by GPT-4o",
    401       "supported": "moderate"
    402     },
    403     {
    404       "claim": "Insufficient diversity in synthetic data (single synthesizer source) causes severe model collapse in Stage 2",
    405       "evidence": "Figure 8: Stage 2 (Failure) using only Synth-Nemo-HQ shows performance reverting below Stage 1 levels with pronounced oscillations in both English and Korean benchmarks",
    406       "supported": "strong"
    407     }
    408   ],
    409   "methodology_tags": [
    410     "benchmark-eval",
    411     "empirical"
    412   ],
    413   "key_findings": "KORMo-10B is the first fully open Korean-English bilingual LLM trained predominantly on synthetic data (68.74% of Korean corpus), demonstrating through systematic 1B proxy ablations and large-scale experiments that carefully curated, diverse synthetic data does not cause model collapse. The paper establishes Pre-LN, intra-document masking, and NTP as optimal pretraining choices for this setting, and shows that diversity of synthetic data sources is essential — single-synthesizer training caused catastrophic collapse. The 10.8B model achieves performance comparable to Gemma3-12B and approaches OLMo2-13B in English (64.2 vs 65.3 average) despite using only 2.9T training tokens, and achieves the highest instruction-following average (8.61) among compared models. Key weakness: the model trails specialized Korean models (KANANA) in Korean-language tasks, attributed to lower Korean data proportion (~5.6%).",
    414   "red_flags": [
    415     {
    416       "flag": "Proxy-to-target transfer not validated",
    417       "detail": "All key architectural choices (Pre-LN, intra-doc masking, NTP, tokenizer selection) are based on 1B proxy experiments; the paper only partially validates that these choices transfer to 10.8B, and Section 2.2 acknowledges 'a re-evaluation is necessary for models with tens to hundreds of billions of parameters.'"
    418     },
    419     {
    420       "flag": "No statistical significance testing",
    421       "detail": "Many comparative claims rest on 1–2 percentage point differences (e.g., intra-doc 44.48% vs causal 43.38%) without confidence intervals or significance tests, making it unclear which differences are meaningful vs noise."
    422     },
    423     {
    424       "flag": "LLM-as-judge evaluation bias",
    425       "detail": "Instruction-following evaluation (MT-Bench, Ko-MT-Bench, Logickor) uses GPT-4o as judge, introducing potential systematic bias favoring models with similar RLHF-style training distributions."
    426     },
    427     {
    428       "flag": "Preference learning stage incomplete",
    429       "detail": "Section 7.4 (RL stage with APO/GRPO) states 'training and evaluation results will be released in future work,' meaning the full training pipeline described in the abstract is not yet demonstrated in the paper."
    430     },
    431     {
    432       "flag": "Benchmark contamination unaddressed",
    433       "detail": "Standard benchmarks (MMLU, ARC, KMMLU) that predate the May 2025 training cutoff are evaluated without any contamination analysis; the paper selects only one new benchmark (KR-Clinical-QA) to partially mitigate this risk."
    434     }
    435   ],
    436   "cited_papers": [
    437     {
    438       "title": "2 OLMo 2 Furious",
    439       "relevance": "Primary precedent for the Fully Open Model (FOM) approach with complete release of training pipeline, code, data, and logs; KORMo explicitly follows this disclosure standard"
    440     },
    441     {
    442       "title": "The Curse of Recursion: Training on Generated Data Makes Models Forget",
    443       "relevance": "Core concern directly addressed by KORMo's empirical validation of synthetic data stability; motivates the paper's research questions RQ1 and RQ3"
    444     },
    445     {
    446       "title": "AI Models Collapse When Trained on Recursively Generated Data (Nature 2024)",
    447       "relevance": "Nature paper establishing model collapse risk from recursive synthetic training; KORMo's failure experiment (Figure 8) and diversity findings directly respond to this"
    448     },
    449     {
    450       "title": "Nemotron-CC: Transforming Common Crawl into a Refined Long-Horizon Pretraining Dataset",
    451       "relevance": "Major synthetic data source (1T tokens English) and methodological precedent for high-quality synthetic pretraining data at scale"
    452     },
    453     {
    454       "title": "The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale",
    455       "relevance": "Primary web corpus used for both English (UltraFineWeb) and Korean (FineWeb2) pretraining; quality filtering methodology is adapted for Korean"
    456     },
    457     {
    458       "title": "KMMLU: Measuring Massive Multitask Language Understanding in Korean",
    459       "relevance": "Core Korean evaluation benchmark suite used throughout to measure domain knowledge; analogous to MMLU for Korean"
    460     },
    461     {
    462       "title": "Tokenizer Choice for LLM Training: Negligible or Crucial?",
    463       "relevance": "Key reference for evaluating compression-vs-downstream-performance trade-offs; cited throughout Section 3 to interpret tokenizer experiment results"
    464     },
    465     {
    466       "title": "DataComp-LM: In Search of the Next Generation of Training Sets for Language Models",
    467       "relevance": "Primary English Stage 1 pretraining data source (DCLM, 1T tokens) and source of the heuristic filtering methodology adapted for Korean data"
    468     }
    469   ],
    470   "engagement_factors": {
    471     "practical_relevance": {
    472       "score": 2,
    473       "justification": "Full release of data, code, and training recipes provides direct replication value for Korean NLP practitioners and non-English FOM builders."
    474     },
    475     "surprise_contrarian": {
    476       "score": 2,
    477       "justification": "Directly challenges the widely-cited model collapse hypothesis by demonstrating synthetic data can serve as the primary pretraining resource when diverse."
    478     },
    479     "fear_safety": {
    480       "score": 0,
    481       "justification": "No AI safety or risk concerns raised; the paper is focused on linguistic performance and reproducibility."
    482     },
    483     "drama_conflict": {
    484       "score": 1,
    485       "justification": "The FOM vs OWM debate and challenge to synthetic data orthodoxy provide mild controversy, but the tone is constructive."
    486     },
    487     "demo_ability": {
    488       "score": 2,
    489       "justification": "The model is claimed to be publicly available at huggingface.co/kormo-lm, enabling immediate download and use."
    490     },
    491     "brand_recognition": {
    492       "score": 1,
    493       "justification": "KAIST is a reputable Korean research institution but not a major AI lab with global brand recognition in the LLM space."
    494     }
    495   },
    496   "hn_data": {
    497     "threads": [
    498       {
    499         "hn_id": "41890784",
    500         "title": "QUIC is not quick enough over fast internet",
    501         "points": 313,
    502         "comments": 280,
    503         "url": "https://news.ycombinator.com/item?id=41890784",
    504         "created_at": "2024-10-19T21:04:52Z"
    505       },
    506       {
    507         "hn_id": "45527191",
    508         "title": "Generalized Orders of Magnitude",
    509         "points": 45,
    510         "comments": 12,
    511         "url": "https://news.ycombinator.com/item?id=45527191",
    512         "created_at": "2025-10-09T13:08:26Z"
    513       },
    514       {
    515         "hn_id": "38515605",
    516         "title": "PixArt-α: Fast Training of Diffusion Transformer for Text-to-Image Synthetis",
    517         "points": 3,
    518         "comments": 1,
    519         "url": "https://news.ycombinator.com/item?id=38515605",
    520         "created_at": "2023-12-04T10:19:22Z"
    521       },
    522       {
    523         "hn_id": "45706862",
    524         "title": "SynthID-Image: Invisibly Watermarking AI-Generated Imagery",
    525         "points": 2,
    526         "comments": 0,
    527         "url": "https://news.ycombinator.com/item?id=45706862",
    528         "created_at": "2025-10-25T20:50:40Z"
    529       },
    530       {
    531         "hn_id": "42780000",
    532         "title": "Vision-Language Models Do Not Understand Negation",
    533         "points": 2,
    534         "comments": 0,
    535         "url": "https://news.ycombinator.com/item?id=42780000",
    536         "created_at": "2025-01-21T13:44:43Z"
    537       },
    538       {
    539         "hn_id": "37884774",
    540         "title": "Workload-aware and Learned Z-Indexes. (ArXiv:2310.04268v1 [cs.DB])",
    541         "points": 1,
    542         "comments": 0,
    543         "url": "https://news.ycombinator.com/item?id=37884774",
    544         "created_at": "2023-10-14T22:20:27Z"
    545       },
    546       {
    547         "hn_id": "33355436",
    548         "title": "Challenging Big-Bench Tasks and Whether Chain-of-Thought Can Solve Them",
    549         "points": 1,
    550         "comments": 1,
    551         "url": "https://news.ycombinator.com/item?id=33355436",
    552         "created_at": "2022-10-27T10:01:21Z"
    553       },
    554       {
    555         "hn_id": "35151086",
    556         "title": "Token Merging: Your ViT but Faster",
    557         "points": 1,
    558         "comments": 0,
    559         "url": "https://news.ycombinator.com/item?id=35151086",
    560         "created_at": "2023-03-14T13:25:09Z"
    561       }
    562     ],
    563     "top_points": 313,
    564     "total_points": 368,
    565     "total_comments": 294
    566   }
    567 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs