scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (21125B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "KorNAT: LLM Alignment Benchmark for Korean Social Values and Common Knowledge",
      6     "authors": [
      7       "Jiyoung Lee",
      8       "Minwoo Kim",
      9       "Seungho Kim",
     10       "Junghwan Kim",
     11       "Seunghyun Won"
     12     ],
     13     "year": 2024,
     14     "venue": "Annual Meeting of the Association for Computational Linguistics",
     15     "arxiv_id": "2402.13605",
     16     "doi": "10.48550/arXiv.2402.13605"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All abstract claims are supported: 6,174 survey participants, 10K MCQ samples (4K/6K split), seven LLMs tested with few meeting reference scores, and government TTA approval are all documented in the paper body with corresponding evidence.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper makes causal-sounding claims (e.g., 'additional fine-tuning on Korean corpora enhances models' understanding of Korean social values' in Appendix E.2) based on observational comparisons of model scores without controlled experiments adequate for causal inference.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The paper explicitly bounds its claims to South Korea as of 2023; the limitations section and broader impacts section warn that 'nationally well-aligned model with one country may not be generalizable to another, especially those with significant cultural differences.'",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Alternative explanations are not systematically considered; for example, why USA cross-national prompting improves Korean alignment scores nearly as much as Korean prompting is unexplained, and HyperCLOVA X's superiority is attributed solely to Korean training data without consideration of other factors.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly acknowledges that MCQ format is 'not the best choice to evaluate the models' capabilities' and introduces three metric variants (SVA, A-SVA, N-SVA) to address the gap between measurement and the claimed construct of national alignment.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "A dedicated Limitations section appears before the Ethics Statement, covering temporal scope (Korea 2023), online survey demographic constraints, exclusion of universal values, and MCQ format limitations.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific quantified threats are provided: online survey resulted in 60+ age group comprising only 11.47% of respondents vs 19.96% of the Korean population; MCQ format limitations cited with reference to Röttger et al. 2024; temporal boundedness to 2023 noted explicitly.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Explicit scope boundaries state the dataset does not cover universally accepted social values and is limited to Korea 2023; the broader impacts section explicitly warns against applying results to other countries.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Funding is disclosed in acknowledgements: IITP grant (No.RS-2019-II190075, No.RS-2024-00338140) from the Korean government and an AI-Hub project funded by MSIT/NIA are named.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All author affiliations are disclosed on the title page: KAIST AI, DATUMO Inc., Seoul National University Bundang Hospital, and NAVER AI Lab.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Author Hwaran Lee is from NAVER AI Lab, and HyperCLOVA X (NAVER's proprietary LLM) achieves the highest performance on the benchmark; this is analogous to company employees evaluating their own product, and no conflict of interest is declared despite this structural bias.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or financial interests declaration is present; the NAVER AI Lab affiliation in relation to HyperCLOVA X being the top-performing evaluated model is not flagged as a potential financial interest.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are precisely defined: 'National Alignment,' 'social value alignment,' 'common knowledge alignment,' and all three metric variants (SVA, A-SVA, N-SVA) are formally defined with mathematical notation and intuitive explanations in Section 4.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Contributions are explicitly enumerated: first national alignment concept and metrics, KorNAT benchmark (10K samples via statistical sampling theory), and government TTA approval establishing dataset quality.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The related works section explicitly compares to Santurkar et al. 2023 (most similar), Wang et al. 2023, Durmus et al. 2023, and others, identifying specific limitations of each (small per-question response counts, non-country-specific questions) and explaining how KorNAT addresses them.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper argues why the benchmark measures national alignment: social value alignment uses population-representative survey responses as ground truth (because 'correct' answers reflect national consensus, not arbitrary labels), and common knowledge uses the compulsory GED curriculum as the canonical definition of national basic knowledge.",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "Common knowledge questions are categorized as 'simple' (one fact) and 'complex' (two related facts) with separate results in Appendix Table 14, but no formal item-difficulty analysis with difficulty tiers is provided, and the social value dataset has no difficulty characterization whatsoever.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "The paper explicitly calculates the maximum achievable SVA score (0.450) and introduces A-SVA (max 0.626) and N-SVA as metric variants specifically to address the ceiling effect caused by natural variability in Korean social values across the five-point scale.",
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No human accuracy baseline is provided for the common knowledge dataset; the social value ground truth is derived from survey responses rather than being a performance measurement; the human evaluation in Section 5.2.3 measures preference between two LLM outputs, not human benchmark accuracy.",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "SVA uses distribution-weighted scoring rather than majority vote, justified by citing Aroyo & Welty 2013 on crowd truth; A-SVA and N-SVA variants are motivated by the specific problem of SVA's low maximum achievable score; common knowledge uses GED 0.6 cut-off with explicit justification.",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No contamination resistance measures are designed in; despite using recent 2023 Korean news and GED curriculum, no analysis of whether benchmark content appears in evaluated model training data is conducted, and no anti-gaming measures (canary strings, temporal holdouts, dynamic generation) are mentioned.",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "The limitations section briefly notes that 'regular updates to the dataset are necessary' given that social values change over time, but provides no concrete update methodology, timeline, or versioning strategy.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The MCQ format limitation is acknowledged and refrained/invalid responses are tabulated in Section 5.4, but there is no systematic discussion of how models could game the benchmark or what structural aspects of national alignment the benchmark fundamentally cannot capture.",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Evaluation protocols and five prompt variants are described in Appendix D, but the full dataset was not released at publication (planned December 2024 on AI hub), with only small samples on HuggingFace, making independent reproduction of reported numbers impossible.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Appendix A provides a comprehensive 'Datasheet for Datasets' covering motivation, composition, collection process (with collection timeline: July–December 2023), preprocessing, uses, distribution, and maintenance following established documentation standards.",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "The paper states the dataset will be released under MIT License on AI hub in December 2024, with sample questions currently on HuggingFace; the requirement to obtain author permission for current use is explicitly noted.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "The intended use (evaluating LLMs for Korean national deployment) is clear, but the datasheet explicitly marks 'Are there tasks for which the dataset should not be used?' as N/A, leaving a gap in guidance about inappropriate use cases or conclusions not warranted by the benchmark.",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "KorNAT is the first benchmark measuring LLM national alignment with South Korea from both social value and common knowledge perspectives.",
    203       "evidence": "Extensive related works section surveys prior datasets and explicitly identifies their gaps (non-country-specific questions, too few responses per question, focus on bias rather than broad national opinion); no contradicting prior work is identified.",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "Most current LLMs are not sufficiently aligned with South Korea—only 2/7 models exceeded the social value reference and 3/7 exceeded the common knowledge reference score of 0.6.",
    208       "evidence": "Table 1 shows only PaLM-2 (A-SVA: 0.532) and HyperCLOVA X (N-SVA: 0.414) meet thresholds for social value; Table 3 shows HyperCLOVA X (0.707), PaLM-2 (0.664), Gemini Pro (0.639) meet the 0.6 knowledge threshold.",
    209       "supported": "strong"
    210     },
    211     {
    212       "claim": "The SVA metric captures genuine alignment—models scoring higher on social value alignment are genuinely preferred by Korean participants, with PaLM-2 preferred over Llama-2 in 94/100 questions.",
    213       "evidence": "Human evaluation (Section 5.2.3) with 107 participants per question across 100 questions; Figure 3 shows preference ratios predominantly 0.7–0.95 favoring PaLM-2.",
    214       "supported": "strong"
    215     },
    216     {
    217       "claim": "Cross-national prompting ('How would someone from Korea respond?') generally improves social value alignment for most models.",
    218       "evidence": "Table 2 shows Korean CP improves A-SVA for GPT-3.5-Turbo (0.435→0.503), GPT-4 (0.448→0.528), and HyperCLOVA X (0.318→0.505); Gemini Pro is essentially unchanged (0.513→0.505), and Claude-1 scores decrease.",
    219       "supported": "moderate"
    220     },
    221     {
    222       "claim": "Survey-based ground truth with 219 average responses per question achieves an error bound of ≤5.5% relative to true Korean population distribution.",
    223       "evidence": "Appendix B.6 provides mathematical derivation using Scheaffer et al. 2011 sampling theory; minimum 198 responses (5.7% error bound) and maximum 243 responses (5.2% error bound) are documented.",
    224       "supported": "strong"
    225     },
    226     {
    227       "claim": "HyperCLOVA X's superior performance on Korean common knowledge (0.707 total vs next best 0.664) is due to Korean-specific pretraining.",
    228       "evidence": "The paper labels this a hypothesis: 'We hypothesize that this superior understanding stems from an enhanced capability in linguistically processing Korean'; no ablation or controlled analysis is provided.",
    229       "supported": "weak"
    230     }
    231   ],
    232   "methodology_tags": [
    233     "benchmark-eval",
    234     "observational"
    235   ],
    236   "key_findings": "KorNAT introduces a 10K-question benchmark measuring LLM alignment with South Korea: 4K social value questions with population-representative survey ground truth (6,174 participants, average 219 responses per question) and 6K common knowledge questions from GED curriculum. Of the seven frontier LLMs tested, only two meet the social value reference and only three meet the common knowledge reference score of 0.6, indicating most LLMs—including GPT-4 and Claude—are insufficiently aligned with Korean culture and knowledge. HyperCLOVA X, trained extensively on Korean data, dramatically outperforms others on common knowledge (0.707 vs 0.664 next best), but PaLM-2 leads on SVA social value alignment. Human evaluation validates the SVA metric: higher-scoring models (PaLM-2) are genuinely preferred over lower-scoring ones (Llama-2) by Korean participants in 94/100 questions with preference ratios of 0.7–0.95.",
    237   "red_flags": [
    238     {
    239       "flag": "NAVER author evaluates NAVER model without conflict disclosure",
    240       "detail": "Author Hwaran Lee is from NAVER AI Lab, and HyperCLOVA X (NAVER's proprietary LLM) achieves the highest benchmark performance; no competing interests statement is present despite this structural conflict."
    241     },
    242     {
    243       "flag": "No contamination analysis",
    244       "detail": "No assessment of whether benchmark questions (Korean 2022–2023 news articles, GED curriculum) appear in evaluated model training corpora; especially concerning for HyperCLOVA X trained on large Korean web data."
    245     },
    246     {
    247       "flag": "Full dataset withheld at publication",
    248       "detail": "Only sample questions available on HuggingFace at publication time; full dataset release planned for December 2024 on AI hub, making independent reproduction of all reported numbers impossible."
    249     },
    250     {
    251       "flag": "Extreme demographic adjustment weights",
    252       "detail": "Education level adjustment weights reach 22.74× for elementary school graduates (Appendix B.9), indicating severe underrepresentation that statistical weighting may inadequately correct given small cell sizes."
    253     },
    254     {
    255       "flag": "USA cross-national prompting nearly matches Korean prompting",
    256       "detail": "Adding 'How would someone from USA respond?' improves Korean alignment scores nearly as much as Korean prompting (e.g., GPT-3.5-Turbo SVA: 0.290 base, 0.334 Korean CP, 0.324 USA CP), suggesting the benefit may be from prompt specificity rather than cultural grounding—an alternative explanation not discussed."
    257     }
    258   ],
    259   "cited_papers": [
    260     {
    261       "title": "Whose opinions do language models reflect?",
    262       "relevance": "Most directly comparable prior work (Santurkar et al. 2023); KorNAT explicitly differentiates by focusing on one country with statistically grounded larger per-question survey samples."
    263     },
    264     {
    265       "title": "Towards measuring the representation of subjective global opinions in language models",
    266       "relevance": "Cross-national alignment work using global surveys (Durmus et al. 2023); KorNAT critiques it for non-country-specific questions and small per-question response counts (10–20)."
    267     },
    268     {
    269       "title": "Not all countries celebrate Thanksgiving: On the cultural dominance in large language models",
    270       "relevance": "Documents English-speaking cultural bias in LLMs (Wang et al. 2023); motivates the need for country-specific benchmarks like KorNAT."
    271     },
    272     {
    273       "title": "Measuring massive multitask language understanding",
    274       "relevance": "MMLU (Hendrycks et al. 2021) is the knowledge benchmark most comparable to KorNAT's common knowledge component; KorNAT differentiates by using Korean GED curriculum rather than translated English content."
    275     },
    276     {
    277       "title": "Political compass or spinning arrow? Towards more meaningful evaluations for values and opinions in large language models",
    278       "relevance": "Critiques MCQ-based value evaluation in LLMs (Röttger et al. 2024); cited in KorNAT's limitations section as the basis for acknowledging MCQ format constraints."
    279     },
    280     {
    281       "title": "SQuARe: A large-scale dataset of sensitive questions and acceptable responses created through human-machine collaboration",
    282       "relevance": "Prior Korean LLM alignment dataset (Lee et al. 2023a); KorNAT differentiates by covering broader national opinion topics with substantially more responses per question."
    283     },
    284     {
    285       "title": "Dealing with disagreements: Looking beyond the majority vote in subjective annotations",
    286       "relevance": "Foundational work on handling annotation disagreement (Davani et al. 2022); directly motivates KorNAT's distribution-based SVA metric rather than majority vote ground truth."
    287     },
    288     {
    289       "title": "NLPositionality: Characterizing design biases of datasets and models",
    290       "relevance": "Related work on cross-cultural biases in NLP datasets and models (Santy et al. 2023); KorNAT critiques it for sub-sampling questions not reflecting country-specific characteristics."
    291     }
    292   ],
    293   "engagement_factors": {
    294     "practical_relevance": {
    295       "score": 3,
    296       "justification": "Directly actionable for companies deploying LLMs in Korea; methodology explicitly described as generalizable to other countries and time periods."
    297     },
    298     "surprise_contrarian": {
    299       "score": 2,
    300       "justification": "GPT-4 failing Korean alignment despite general capability is noteworthy; USA cross-national prompting improving Korean scores nearly as much as Korean prompting is counterintuitive."
    301     },
    302     "fear_safety": {
    303       "score": 1,
    304       "justification": "Raises concern about culturally misaligned AI deployed in non-English-speaking countries, but framed constructively as a benchmark contribution rather than alarmingly."
    305     },
    306     "drama_conflict": {
    307       "score": 1,
    308       "justification": "No significant controversy; underperformance of non-Korean-trained models is expected, and results are presented neutrally."
    309     },
    310     "demo_ability": {
    311       "score": 2,
    312       "justification": "Sample questions available on HuggingFace; public leaderboard planned for June 2024, enabling hands-on evaluation by the research community."
    313     },
    314     "brand_recognition": {
    315       "score": 2,
    316       "justification": "Evaluates GPT-4, Claude, Gemini Pro, and PaLM-2; KAIST and NAVER AI Lab are well-known Korean AI institutions; government TTA approval adds institutional credibility."
    317     }
    318   },
    319   "hn_data": {
    320     "threads": [],
    321     "top_points": 0,
    322     "total_points": 0,
    323     "total_comments": 0
    324   }
    325 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs