ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (34265B)


      1 {
      2   "paper": {
      3     "title": "KorNAT: LLM Alignment Benchmark for Korean Social Values and Common Knowledge",
      4     "authors": [
      5       "Jiyoung Lee",
      6       "Minwoo Kim",
      7       "Seungho Kim",
      8       "Junghwan Kim",
      9       "Seunghyun Won",
     10       "Hwaran Lee",
     11       "Edward Choi"
     12     ],
     13     "year": 2024,
     14     "venue": "Annual Meeting of the Association for Computational Linguistics",
     15     "arxiv_id": "2402.13605",
     16     "doi": "10.48550/arXiv.2402.13605"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "KorNAT is the first benchmark for measuring national alignment of LLMs with South Korea, comprising 4K social value questions (grounded in a 6,174-person survey) and 6K common knowledge questions (from Korean GED curriculum). Testing seven LLMs, only PaLM-2 and HyperCLOVA X exceeded reference scores in social value and common knowledge respectively, indicating most LLMs are insufficiently aligned with Korean values and knowledge. HyperCLOVA X (trained extensively on Korean data) dramatically outperformed others on Korean-specific subjects, while all models struggled with Mathematics and Science. A human evaluation validated that the proposed alignment metrics correlate with actual Korean population preferences.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No source code repository (e.g., GitHub link) is provided. The paper only links to a HuggingFace dataset page with samples and evaluation protocols."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "Only 'small samples from the dataset are publicly available now' at the HuggingFace link. The full dataset was scheduled for release on AI Hub in December 2024. At the time of publication, the full benchmark was not available for independent evaluation."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Appendix D.1 specifies '2 NVIDIA A100 GPUs. We used CUDA 12.0, Pytorch 2.0.1, and huggingface 4.31.0.' This provides sufficient detail for environment recreation."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The paper describes the methodology but does not provide executable instructions for reproducing the evaluation pipeline."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "All results tables (Tables 1, 2, 3) report ± standard deviation across five prompt variations. For example, 'PaLM-2: 0.331±0.007' in SVA Final Adjustment."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No statistical significance tests are performed. Comparisons between models are based solely on comparing point estimates and standard deviations. Claims like 'PaLM-2 shows the highest social value alignment' are made without formal testing."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Absolute alignment scores are reported alongside reference baselines (Best Score, All-Neutral), allowing computation of effect magnitudes. For example, PaLM-2's A-SVA of 0.532 vs. All-Neutral of 0.190 and Best of 0.626 provides full context."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Appendix B.6 provides a rigorous sampling theory justification: with 219 average responses per question, they achieve an error bound of <5.5% at 95% confidence. The derivation from Scheaffer et al. (2011) is fully worked out."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Standard deviations across five different prompts are reported for all experiments. For example, Table 1 shows '0.331±0.007' for PaLM-2 SVA, and Table 3 shows per-subject standard deviations."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Table 1 includes 'Best Score' (theoretical maximum achievable alignment) and 'All-Neutral' (naive baseline answering Neutral for all questions). Seven LLMs are compared against each other and these reference points."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The seven models tested (GPT-4, GPT-3.5-Turbo, Claude-1, PaLM-2, Gemini Pro, Llama-2 70B, HyperCLOVA X) were contemporary state-of-the-art models at the time of the study (2023-2024)."
     82       },
     83       "ablation_study": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "The paper proposes a benchmark instrument, not a system with removable components. The three response adjustment scenarios (No Adjustment, Age & Gender, Final) serve as variations of ground truth construction, not ablations of a method."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Three social value alignment metrics (SVA, A-SVA, N-SVA) and accuracy per subject for common knowledge. Additionally, refrained/invalid response counts are reported."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Section 5.2.3 describes a human evaluation with 107 participants comparing PaLM-2 and Llama-2 outputs. Participants selected which model's reasoning better aligned with their opinions across 100 questions. PaLM-2 was preferred in 94/100 questions."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "All models are evaluated zero-shot on KorNAT with no fine-tuning or tuning on any portion of the dataset. The entire benchmark serves as a held-out test set. Results are averaged across 5 prompts rather than cherry-picking."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Table 3 provides per-subject breakdowns across 7 subjects for common knowledge. Table 1 shows three metrics under three adjustment conditions. Table 11 shows per-gender and per-age group breakdowns. Table 14 separates simple vs. complex questions."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 5.4 and Table 4 analyze omitted responses (refrained and invalid), with examples in Appendix D.3. They discuss GPT-4's 13.93% refrained rate and Claude-1's 11.99% refrained rate on social value questions, and Gemini Pro's 'A/B/C/D' invalid response pattern."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper's central finding is essentially negative: most models fail to meet reference scores. The average Mathematics score was only 0.333 across models. Claude-1's alignment decreased with Cross-national Prompting (Table 2). Most models perform worse on Korean-specific subjects than on English."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The abstract claims 'only a few models met our reference score' — confirmed in Tables 1 and 3 (2/7 for social value, 3/7 for common knowledge). Claims about dataset size (4K+6K=10K), survey size (6,174 participants), and government approval are all substantiated in the paper body."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper makes implicit causal claims, e.g., 'This suggests that the structure of prompts influences the social value alignment of LLMs' (Section 5.2.2) and 'models specifically trained for the Korean context are particularly effective at capturing Korean common knowledge' (Section 5.3). These are not backed by controlled causal designs — confounds like model size, training data volume, and architecture differences are not isolated."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper is explicitly scoped to South Korea: 'Our dataset primarily focuses on Korea in 2023' (Limitations). The title specifies 'Korean Social Values and Common Knowledge.' They note that 'nationally well-aligned model with one country may not be generalizable to another' (Broader Impacts)."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper offers single interpretations for most findings. For HyperCLOVA X's superiority, they 'hypothesize' it stems from Korean linguistic capability and training data (Section 5.3) but don't seriously consider alternatives (model size, architecture, instruction tuning quality). The cross-national prompting results showing USA CP also improves alignment are noted but not explored for confounds."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper clearly defines 'national alignment' as a composite of social value alignment (agreement with survey distribution) and common knowledge alignment (accuracy on curriculum-based questions). The metrics directly measure what is claimed — there is no gap between measurement and framing."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "Appendix D.1 specifies some models ('claude-instant-1.2', 'hcx-002'/'hcx-003', 'PaLM 2 Text Bison', 'gpt-4-1106-preview' for post-processing) but not all. GPT-3.5-Turbo lacks a snapshot version, 'instance version of gpt-4' is ambiguous, and Gemini Pro has no version identifier. Version specification is therefore inconsistent across models."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Full prompt templates with actual text are provided in Table 10 and Appendix D.2, including all five semantically similar prompt variations in both Korean and English translation. The fill values (question, answer candidates) come directly from the dataset."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "No temperature, top-p, max tokens, or other sampling parameters are reported for the evaluation experiments. For question generation with GPT-3.5-Turbo, they mention 'default decoding strategy' but do not state what the defaults were."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. Models are prompted directly with instruction, question, and choices in a zero-shot manner."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Extensive documentation of the data pipeline: keyword extraction (1,644 keywords), question generation and two rounds of human revision (Section 3.1.2), survey filtering via distractor and consistency checks (Section 3.1.3), response adjustment formulas (Section 3.1.4, Eqs. 1-3), and post-processing of model outputs (Appendix D.3)."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "A dedicated 'Limitations' section discusses temporal scope (Korea 2023), lack of universal values, demographic imbalances in survey recruitment, and MCQ format limitations."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Specific threats are discussed: the dataset focuses on Korea in 2023 and needs updating as values change; unable to recruit exactly equal demographic representation (smallest subgroup: 708 aged 60+); MCQ format doesn't capture free-form generation capabilities (citing Röttger et al., 2024)."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The paper explicitly states what it does NOT cover: 'we did not address universally accepted social values' and 'we could not cover thorough free-form generation evaluation.' In Broader Impacts: 'nationally well-aligned model with one country may not be generalizable to another, especially those with significant cultural differences.'"
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "Only 'small samples from the dataset are publicly available now' at HuggingFace. The full dataset and raw survey responses are not available for independent verification at time of publication."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Data collection is described in exceptional detail: Sections 3.1-3.2 and extensive appendices cover keyword extraction sources, question generation prompts, revision guidelines, survey interface (Figure 4), participant payment, filtering criteria, and textbook sources (Table 7)."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Survey participants: 6,174 Korean citizens over age 19, recruited via an online survey platform with stratification by age and gender groups. Workers: 34 college graduates for the first revision round, 7 selected for their diligence for the second round, and 21 for the knowledge dataset (with specific qualification criteria). Demographics are reported in Table 5."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Full pipeline documented: 1,644 keywords → GPT-3.5-Turbo question generation → 2 rounds of human revision by 34 then 7 workers → survey of 6,174 participants → filtering via distractors/consistency → response adjustment (stratification + sampling + education/residence/income). Figure 2 provides a visual overview."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Acknowledgements section discloses IITP grants (No.RS-2019-II190075, No.RS-2024-00338140) funded by the Korean government (MSIT), and the Open AI Dataset Project (AI-Hub) hosted by MSIT and NIA."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "All affiliations are listed: KAIST AI, DATUMO Inc., Seoul National University Bundang Hospital, and NAVER AI Lab. Hwaran Lee's NAVER affiliation is disclosed, and NAVER's HyperCLOVA X is one of the evaluated models."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Funding comes from the Korean government (IITP/MSIT/NIA), which has no direct financial interest in the performance of any particular model. The government's interest is in measuring LLM alignment with Korea, not promoting a specific model."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is present. Hwaran Lee is from NAVER AI Lab, whose product HyperCLOVA X is evaluated and performs best on common knowledge. Authors from DATUMO Inc. (a data company) are also co-authors. These potential conflicts are not explicitly declared."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No training data cutoff dates are stated for any of the seven models tested. This is important because the social value questions were generated from news articles published over a 12-month window (Aug 2022–Jul 2023), and some models may have been trained on similar content."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No discussion of whether any model's training data overlaps with the benchmark content. The questions were generated using GPT-3.5-Turbo from public news articles, and the common knowledge questions are based on published textbooks — both could appear in model training data."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "No contamination analysis is performed. While KorNAT is a newly created benchmark (reducing direct leakage risk), the source materials (Korean news articles, textbooks, GED reference materials) are publicly available and could be in model training data. This is not discussed."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "No pre-registration is mentioned. The study design, survey questions, and analysis plan were not registered in advance on any pre-registration platform."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": true,
    259         "answer": true,
    260         "justification": "'This study has been approved by KAIST IRB (KH2024-020)' is stated in the Ethics Statement section."
    261       },
    262       "demographics_reported": {
    263         "applies": true,
    264         "answer": true,
    265         "justification": "Extensive demographics in Table 5: breakdown by age × gender (10 groups), job (22 NACE categories), domestic area (7 regions), sexual orientation, education level (6 levels), annual income (10 brackets), religion (5 categories), and disability status."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": true,
    269         "answer": true,
    270         "justification": "Survey participants: Korean citizens over age 19, recruited with stratification by age and gender. Workers for question revision: 'all college graduates or above.' Workers for complex knowledge questions had stricter criteria: 'top 4% in Korean SAT, experience in education, or holding a college degree or higher in the relevant subject.'"
    271       },
    272       "randomization_described": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "For the human evaluation (Section 5.2.3), participants compared pairs of model outputs, but the paper does not describe how the order of presentation (Answer A vs. Answer B) was randomized or counterbalanced. For the main survey, the randomization of question presentation is not described beyond distractor/consistency questions appearing 'randomly with a probability of 10%.'"
    276       },
    277       "blinding_described": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "The human evaluation (Section 5.2.3) presents model outputs as 'Answer A' and 'Answer B' (Figure 7), which suggests blinding, but the paper never explicitly states that evaluators were blinded to model identity."
    281       },
    282       "attrition_reported": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "The paper reports 6,174 final participants and describes filtering criteria (distractor questions, consistency checks), but does not report how many participants were initially recruited vs. how many were excluded by these quality controls. The pre-filtering count is never stated."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No inference costs, API costs, or evaluation latency are reported despite testing seven models including multiple API-based models (GPT-4, GPT-3.5-Turbo, Claude-1, PaLM-2, Gemini Pro) on 10K questions each across 5 prompts."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "Hardware is mentioned ('2 NVIDIA A100 GPUs' in Appendix D.1) but no total compute time, API spend, or wall-clock time for the full evaluation is stated."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Results are reported across 5 prompt variations, which is a form of sensitivity analysis, but random seed sensitivity is not explicitly tested. For API models with stochastic outputs, seed or temperature sensitivity is not addressed."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "'We conducted experiments using five distinct yet semantically similar prompts' (Section 5.1). All results are reported as averages and standard deviations across these 5 runs."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No hyperparameter search budget is reported. The five prompts appear hand-crafted, but the selection process and the number of candidate prompts considered are not disclosed."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "The paper reports averages across all 5 prompts rather than cherry-picking the best-performing prompt. This is a principled approach that avoids selection bias."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "Many comparisons are made (7 models × 3 metrics × 3 adjustments for social values; 7 models × 7 subjects for knowledge) without any multiple comparison correction."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "Co-author Hwaran Lee is from NAVER AI Lab, and NAVER's HyperCLOVA X is evaluated in the benchmark. The Acknowledgements thank multiple Korean AI companies including NAVER for 'discussions regarding the benchmark planning process.' The potential bias of evaluating an affiliated product is not acknowledged."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "Models with vastly different compute requirements (e.g., Llama-2-70B vs. GPT-4 vs. HyperCLOVA X) are compared without any discussion of compute budget normalization. Model sizes and training compute are not controlled for."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": true,
    339         "justification": "Section 5.2.3 conducts a human evaluation validating that the proposed SVA metric correlates with actual Korean population preferences (PaLM-2 preferred in 94/100 questions over Llama-2). The Limitations section discusses MCQ format limitations. The survey-based ground truth is grounded in sampling theory."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "No scaffolding is used. Models are evaluated via direct zero-shot prompting."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "Not discussed. The social value questions are based on news articles from Aug 2022–Jul 2023, and common knowledge questions from published textbooks. Models trained after these dates may have seen the source materials, but this temporal relationship is never analyzed."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "Not discussed. The MCQ format with answer options could provide structural cues (e.g., longer correct answers, specific patterns in distractor design) that leak information about the correct answer."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "Not discussed. Social value questions were generated from public news articles, and common knowledge questions from published textbooks — both widely available in pre-training corpora."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No leakage detection method (canary strings, membership inference, n-gram overlap analysis) is applied. The paper relies implicitly on the benchmark being newly created, without formal verification."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Most current LLMs are not sufficiently aligned with South Korea in either social values or common knowledge.",
    373       "evidence": "Table 1 shows only PaLM-2 exceeded reference performance in SVA and A-SVA. Table 3 shows only HyperCLOVA X, PaLM-2, and Gemini Pro surpassed the 0.6 reference score for common knowledge, with HyperCLOVA X exceeding it by only 0.107.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Models extensively trained on Korean data (HyperCLOVA X) are particularly effective at capturing Korean common knowledge.",
    378       "evidence": "Table 3 shows HyperCLOVA X achieves 0.707 total score, outperforming all others across most subjects (e.g., 0.783 in Korean vs. next-best 0.652), with analysis of unique correct answers in Appendix D.6 Table 13 showing questions requiring Korean cultural understanding.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "The proposed alignment metrics (SVA, A-SVA, N-SVA) effectively reflect social values as judged by Korean citizens.",
    383       "evidence": "Section 5.2.3 shows PaLM-2 (highest A-SVA) was preferred by human evaluators over Llama-2 (lowest A-SVA) in 94 out of 100 questions, with preference ratios predominantly 0.7–0.95 (Figure 3, 107 evaluators).",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Cross-national prompting with Korean context generally improves social value alignment.",
    388       "evidence": "Table 2 shows Korean CP improves SVA and A-SVA for GPT-3.5-Turbo, GPT-4, HyperCLOVA X, and Gemini Pro compared to baseline. However, Claude-1's alignment decreases with Korean CP (SVA drops from 0.286 to 0.227).",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "HyperCLOVA X tends to avoid engaging in topics with divided opinions (highest N-SVA, preference for Neutral).",
    393       "evidence": "Table 1 shows HyperCLOVA X achieves best N-SVA (0.414) while scoring lower in SVA (0.253) and A-SVA (0.318). Table 4 shows 59 refrained responses. The paper notes it is 'the only model to outperform All-Neutral' in N-SVA.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "Korean fine-tuned small LMs tend to outperform their multilingual base models on national alignment.",
    398       "evidence": "Tables 17 and 18 (Appendix E) show Korean fine-tuned models (KoAlpaca, KoVicuna, KoAlpaca-Polyglot, KULLM-Polyglot, KoLlama-2) consistently outperform multilingual counterparts in both SVA and common knowledge. KoLlama-2 achieves highest total knowledge score (0.324) among small models.",
    399       "supported": "moderate"
    400     }
    401   ],
    402   "red_flags": [
    403     {
    404       "flag": "Undisclosed conflict of interest",
    405       "detail": "Co-author Hwaran Lee is from NAVER AI Lab. NAVER's HyperCLOVA X is evaluated in the benchmark and achieves the best common knowledge score (0.707). The Acknowledgements thank NAVER and other Korean companies for 'discussions regarding the benchmark planning process,' but no conflict of interest statement appears in the paper."
    406     },
    407     {
    408       "flag": "Dataset not available for verification",
    409       "detail": "Only small samples are publicly available at HuggingFace. The full dataset was scheduled for release on AI Hub in December 2024. Independent verification and replication of results are impossible without the full benchmark. The survey ground truth distributions, which are the core contribution, are not available."
    410     },
    411     {
    412       "flag": "No contamination analysis despite public source materials",
    413       "detail": "Social value questions were generated from publicly available Korean news articles, and common knowledge questions from published textbooks and GED materials. These sources are likely present in the training data of tested models, yet no contamination analysis is performed. The strong performance of HyperCLOVA X (trained extensively on Korean content) could partly reflect memorization rather than genuine understanding."
    414     },
    415     {
    416       "flag": "GPT-4 as judge introduces potential bias",
    417       "detail": "GPT-4 (gpt-4-1106-preview) is used to map non-conforming model responses to answer choices (Appendix D.3). This introduces potential systematic bias: GPT-4 may interpret responses from certain models differently. GPT-4 itself is also one of the evaluated models, creating a circularity concern."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "Training language models to follow instructions with human feedback",
    423       "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"],
    424       "year": 2022,
    425       "relevance": "Foundational work on RLHF alignment that KorNAT's social value alignment concept extends to national/cultural contexts."
    426     },
    427     {
    428       "title": "GPT-4 technical report",
    429       "authors": ["OpenAI"],
    430       "year": 2023,
    431       "arxiv_id": "2303.08774",
    432       "relevance": "GPT-4 is one of the seven LLMs evaluated in the benchmark and is used as a judge for post-processing responses."
    433     },
    434     {
    435       "title": "Language models are few-shot learners",
    436       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    437       "year": 2020,
    438       "relevance": "Foundational GPT-3 paper motivating research on LLM alignment and capability evaluation."
    439     },
    440     {
    441       "title": "Llama 2: Open foundation and fine-tuned chat models",
    442       "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"],
    443       "year": 2023,
    444       "arxiv_id": "2307.09288",
    445       "relevance": "Llama-2 is evaluated in KorNAT and serves as the least-aligned model in the human evaluation comparison."
    446     },
    447     {
    448       "title": "HyperCLOVA X technical report",
    449       "authors": ["Kang Min Yoo", "Jaegeun Han"],
    450       "year": 2024,
    451       "arxiv_id": "2404.01954",
    452       "relevance": "HyperCLOVA X (Korean-specialized LLM from NAVER) achieves the best common knowledge alignment in the benchmark evaluation."
    453     },
    454     {
    455       "title": "Whose opinions do language models reflect?",
    456       "authors": ["Shibani Santurkar", "Esin Durmus", "Faisal Ladhak"],
    457       "year": 2023,
    458       "relevance": "Closely related work on measuring whose values LLMs reflect; KorNAT extends this to Korean-specific national alignment."
    459     },
    460     {
    461       "title": "Towards measuring the representation of subjective global opinions in language models",
    462       "authors": ["Esin Durmus", "Karina Nguyen"],
    463       "year": 2023,
    464       "arxiv_id": "2306.16388",
    465       "relevance": "Measures global opinion representation in LLMs; KorNAT's cross-national prompting builds on their method."
    466     },
    467     {
    468       "title": "Measuring massive multitask language understanding",
    469       "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart"],
    470       "year": 2021,
    471       "relevance": "MMLU benchmark for LLM knowledge evaluation; KorNAT's common knowledge dataset is compared against MMLU Korean in Appendix C.4."
    472     },
    473     {
    474       "title": "TruthfulQA: Measuring how models mimic human falsehoods",
    475       "authors": ["Stephanie Lin", "Jacob Hilton", "Owain Evans"],
    476       "year": 2022,
    477       "relevance": "Benchmark for measuring LLM truthfulness; related to KorNAT's approach of testing factual knowledge alignment."
    478     },
    479     {
    480       "title": "Aligning AI with shared human values",
    481       "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart"],
    482       "year": 2020,
    483       "relevance": "Foundational dataset for measuring LLM alignment with human values; KorNAT extends this framework to nation-specific values."
    484     },
    485     {
    486       "title": "A general language assistant as a laboratory for alignment",
    487       "authors": ["Amanda Askell", "Yuntao Bai", "Anna Chen"],
    488       "year": 2021,
    489       "arxiv_id": "2112.00861",
    490       "relevance": "Early work on LLM alignment methods; KorNAT provides a benchmark for evaluating alignment outcomes in Korean contexts."
    491     },
    492     {
    493       "title": "Political compass or spinning arrow? Towards more meaningful evaluations for values and opinions in large language models",
    494       "authors": ["Paul Röttger", "Valentin Hofmann"],
    495       "year": 2024,
    496       "arxiv_id": "2402.16786",
    497       "relevance": "Critiques MCQ-based value evaluation of LLMs; cited in KorNAT's Limitations as motivation for acknowledging MCQ format constraints."
    498     }
    499   ],
    500   "engagement_factors": {
    501     "practical_relevance": {
    502       "score": 1,
    503       "justification": "Useful for researchers building Korean-targeted LLMs but not immediately actionable as a tool or technique for general practitioners."
    504     },
    505     "surprise_contrarian": {
    506       "score": 1,
    507       "justification": "Finding that most LLMs are poorly aligned with non-English cultures is expected; the Korean-specific detail adds nuance but doesn't challenge conventional wisdom."
    508     },
    509     "fear_safety": {
    510       "score": 0,
    511       "justification": "No AI safety or security concerns are raised; the paper focuses on cultural alignment measurement, not adversarial risks."
    512     },
    513     "drama_conflict": {
    514       "score": 0,
    515       "justification": "No controversy or provocative claims; straightforward benchmark construction and evaluation."
    516     },
    517     "demo_ability": {
    518       "score": 1,
    519       "justification": "HuggingFace link with sample data exists, but the full benchmark is not publicly available for immediate use."
    520     },
    521     "brand_recognition": {
    522       "score": 1,
    523       "justification": "Evaluates well-known models (GPT-4, PaLM-2, Gemini) but the paper itself is from Korean academic/industry labs without global brand recognition."
    524     }
    525   }
    526 }

Impressum · Datenschutz