scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31691B)
      1 {
      2   "paper": {
      3     "title": "An Evaluation of Cultural Value Alignment in LLM",
      4     "authors": [
      5       "Nicholas Sukiennik",
      6       "Chen Gao",
      7       "Fengli Xu",
      8       "Yong Li"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv.org",
     12     "arxiv_id": "2504.08863",
     13     "doi": "10.48550/arXiv.2504.08863"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "Across 10 LLMs and 20 countries, all models converge toward a moderate global average culture rather than representing individual countries' cultural values accurately. The United States is by far the best-aligned country (deviation ratio 1.99 vs. next-best 1.13), likely due to dominant US-origin web training data. GLM-4, despite having only 9 billion parameters, achieves the best cultural alignment among all models tested. Both Chinese-origin and US-origin models align better with US culture than with Chinese culture, and a strong correlation (r=0.94) exists between a country's share of web content and how well models align to its culture.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No source code, repository URL, or code archive is mentioned anywhere in the paper."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The evaluation instrument (Hofstede's Values Survey Module) is a publicly available questionnaire, and the ground truth cultural dimension scores are publicly available from the Hofstede official website (geerthofstede.com) and a referenced third-party consultancy. The authors did not release their collected LLM response data, but the benchmark inputs and ground truths are standard public resources."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No environment specifications, dependency files, or library versions are mentioned in the paper."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions, README, or reproduction scripts are provided."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Results are reported as point estimates (deviation ratios, average absolute differences) without confidence intervals or error bars on any figure or table."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper reports Pearson correlation coefficients (r=0.54, r=0.94, r=0.81, r=0.20, r=0.13) but no significance tests (no p-values, t-tests, or bootstrap tests) are reported for any comparative claims between models or countries."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Pearson correlation coefficients (r=0.54, 0.94, 0.81, 0.20, 0.13) are reported for all external-factor analyses in Section 4.3/Figure 6. Deviation ratios (Table 3) and absolute differences (Figure 3) provide magnitude context for country and model comparisons."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification is given for why 20 countries, 10 models, or 3 trials per prompt were chosen. No power analysis is provided."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper states 'each country-language prompt was called three times and averaged' (Section 3) but no standard deviation, variance, or spread measure across the three runs is reported anywhere."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Ten LLMs are compared against each other and against Hofstede ground truth cultural dimension scores. A 'global average' culture baseline is also computed and used as the reference point for the deviation ratio metric (Equation 1)."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The model set includes recent models (GPT-4o, DeepSeek-v2.5, Qwen-2.5 series) alongside slightly older but still relevant ones (GPT-3.5-Turbo, GPT-4), representing a contemporary cross-section of LLMs."
     79       },
     80       "ablation_study": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "The experimental setup has only one component — prompting an LLM with a questionnaire in a cultural role. There are no system components to ablate."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Two evaluation metrics are used and compared: average absolute difference from ground truth (Figure 3a) and the proposed deviation ratio (Figure 3b, Equation 1). The paper explicitly discusses how the two metrics produce different model rankings."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No human evaluation of LLM outputs is conducted. Ground truth comes from pre-existing aggregated human survey data (Hofstede scores), not from humans evaluating the LLM responses produced in this study."
     94       },
     95       "held_out_test_set": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "The study prompts LLMs with a fixed questionnaire and compares to known ground truths. No model training or tuning is performed, so there is no concept of a dev/test split."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are broken down by all six cultural dimensions (POW, IND, MASC, UAV, LTO, IVR) in Figure 7, by all 20 countries in Table 3 and Figure 2, by all 10 models in Figure 3, and by model origin in Figures 4-5."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 4 discusses countries with poor alignment (Bangladesh, Turkey, Portugal at deviation ratios 0.59-0.62), dimensions that are harder to align (MASC, IND, IVR in Figure 7), and the general failure of all models to represent non-US cultures accurately."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Several negative findings are reported: model size does not strongly predict alignment (Pearson r=0.13, Figure 6a), China-origin models fail to align well with Chinese culture despite their origin (Section 4.2), and most countries show poor alignment overall."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract's key claims are supported: US is best-aligned country (Table 3, deviation ratio 1.99), GLM-4 has best alignment ability (Figure 3b), models converge on a global average (Figure 1), and models align better with US than China regardless of origin (Figures 4-5)."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper uses causal language ('the influence of model origin and language on cultural alignment,' 'factors that may be influencing these results,' 'could be in large part explained by the amount of training data') from purely observational/correlational data. No causal identification strategy (RCT, instrumental variables, etc.) is employed. Confounds between model origin, training data composition, and model architecture are not addressed."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title 'An Evaluation of Cultural Value Alignment in LLM' and conclusions about 'the overall state of cultural alignment of LLMs' generalize beyond the 10 models, 20 countries, and single instrument tested. The paper does not bound its conclusions to these specific models and countries in its main claims."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section 4.3 systematically examines multiple alternative explanations for alignment differences: web content proportion (r=0.94), GDP (r=0.81), digital population (r=0.20), and model size (r=0.13). The relative strengths of these factors are discussed."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper measures LLM responses to a 24-item structured questionnaire and frames this as 'cultural value alignment.' It does not discuss the gap between survey-question responses and actual behavioral cultural alignment in downstream applications. The VSM is called 'the gold standard' without questioning whether LLM responses to forced-choice questions capture the same construct as human cultural values."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Table 2 lists models as 'GPT-3.5-Turbo,' 'GPT-4,' 'GPT-4o,' 'Gemini-1.5,' 'LLaMA-3' without specific version snapshots or API dates. While Qwen variants (e.g., 'Qwen-2.5-7B-Instruct') and 'Deepseek-v2.5' are more specific, the majority of models lack version identifiers needed for reproduction."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Table 1 provides the full prompting mechanism including the system role ('Your role is an average person from {country}'), the question format with response options, and the additional instruction to 'make only one choice and always include a numerical value.' All 20 country/language fill values are in Table A, and the 24 questions come from the publicly referenced Hofstede VSM questionnaire."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 3 states 'The models were called using a temperature of zero as to reduce deterministic outputs and increase reproducibility.' With temperature=0 (greedy decoding), other sampling parameters are effectively irrelevant."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. Models are prompted directly with questionnaire items via their APIs."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "The pipeline is documented: prompts are constructed per Table 1, 'only the numerical response was extracted,' dimension scores are calculated via Equation 2 with specified hyperparameters, values are normalized to 0-100 scale (Appendix A.1), and results are averaged over 3 runs."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "There is no dedicated limitations section. Two sentences at the end of Section 6 (Conclusions) mention limitations: 'Some limitations of our study include the countries and languages chosen' and the one-language-per-country issue. This does not constitute substantive discussion."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The brief limitations mention in the conclusion names only two specific issues (one language per country, same-language countries not tested) without deeper analysis of threats such as contamination of Hofstede data in training sets, construct validity of using survey questions on LLMs, or potential confounds in the model-origin analysis."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The paper explicitly states it did not test multiple languages per country and did not examine countries sharing the same primary language, noting 'our cultural alignment evaluations cannot be considered complete' (Section 6). These are specific things the results do NOT show."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The raw LLM responses (12,000 API call outputs across 10 models × 20 countries × 20 languages × 3 trials) are not released or made available for independent verification."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 3 and Table 1 describe the data collection procedure: each of 10 models was prompted with 20 country-roles in 20 languages using the VSM questionnaire, 3 times each, at temperature=0. Ground truth sources are specified (Hofstede official website and a third-party consultancy)."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants are involved. Data sources are LLM APIs and published Hofstede cultural dimension scores."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The pipeline from collection to analysis is documented: prompt LLMs (Table 1) → extract numerical responses → calculate dimension scores (Equation 2) → normalize to 0-100 → average over 3 runs → compare to ground truth → compute deviation ratio (Equation 1)."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding sources, grants, or sponsors are disclosed anywhere in the paper."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations with Tsinghua University and BNRist are listed. Table 2 notes GLM-4 is from 'Zhipu AI/Tsinghua,' making the connection between the authors' institution and one of the evaluated models visible."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding is disclosed, making it impossible to assess funder independence. The authors are affiliated with Tsinghua University, which is connected to GLM-4 (found to be the best-performing model), creating a potential undisclosed conflict."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is included in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No training data cutoff dates are stated for any of the 10 models tested."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The Hofstede Values Survey Module and its associated country scores have been published since 1980 and are extremely widely available online. All models tested were almost certainly trained on data containing these ground truth scores. This critical overlap is not discussed."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "Hofstede's cultural dimensions framework and country scores are among the most widely cited resources in cross-cultural studies, available on multiple websites since the 1980s. All tested models were trained well after this data was published. The possibility that models have memorized Hofstede scores rather than representing genuine cultural understanding is not addressed."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study. LLMs are prompted, not humans."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are involved in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants are involved in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants are involved in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
     289         "justification": "The study involves at least 12,000 questionnaire runs (10 models × 20 countries × 20 languages × 3 trials, each covering the 24 VSM questions), but no inference costs, API spending, or latency figures are reported."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No computational budget, total API spend, or hardware information is stated."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
     301         "justification": "Three runs are performed per condition but no sensitivity analysis or variance across runs is reported. Temperature=0 should produce deterministic outputs, yet there is no discussion of whether outputs actually varied across the three runs."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "Section 3 explicitly states 'each country-language prompt was called three times and averaged for each model.'"
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No hyperparameter search process is described. The choice of temperature=0 and 3 runs is not justified."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No configuration selection process is described. The prompting format (Table 1) is presented without comparing alternative prompting strategies."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The paper makes numerous comparisons across 10 models, 20 countries, 6 dimensions, and multiple language conditions without any correction for multiple comparisons."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors are from Tsinghua University, which developed GLM-4 (via Zhipu AI/Tsinghua). GLM-4 is found to be the best-performing model. No acknowledgment of self-evaluation bias is made."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "Figure 6(a) plots model size (parameters) vs alignment, but does not report actual compute budgets or inference costs. Parameters are an imperfect proxy for compute."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper calls Hofstede's VSM 'the gold standard of cultural studies' without questioning whether forced-choice survey responses from LLMs measure the same construct as aggregated human cultural values, or whether LLM survey responses map to actual cultural behavior in downstream applications."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is used. Models are prompted directly via their APIs."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "Hofstede's cultural dimension scores and the VSM questionnaire have been published since 1980 and are available on numerous websites. All tested models were trained well after this data was published, yet temporal leakage is not discussed."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The structured format of VSM questions with fixed response options could cue models that have encountered this specific questionnaire in training data. This form of feature leakage is not discussed."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "The 24 VSM questions and Hofstede country scores are widely reproduced across academic papers, textbooks, and websites. The near-certainty that this exact content appears in training data is not addressed."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, or decontamination analysis is performed."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "The United States is the most closely aligned country across all models by a wide margin (deviation ratio 1.99 vs. next-best Germany at 1.13).",
    370       "evidence": "Table 3 shows country deviation ratios; Figure 2 shows the US as a clear outlier with high deviation from global average yet low difference from ground truth. Section 4.1.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "GLM-4 has the best ability to align to cultural values among all 10 models tested, despite having only 9 billion parameters.",
    375       "evidence": "Figure 3(b) ranks GLM-4 first on the deviation ratio metric across all prompting methods. Figure 6(a) shows GLM-4 as an outlier with high alignment at low parameter count. Section 4.1.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "LLMs converge toward a moderate global average culture close to the median possible value for all dimensions, regardless of country-role or prompt language.",
    380       "evidence": "Figure 1(a) shows all country representations clustered near the center of the radar chart, in stark contrast to the spread of ground truths in Figure 1(b). Section 4.1.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Both China-origin and US-origin models align better with US culture than with Chinese culture, regardless of prompt language.",
    385       "evidence": "Figure 4 shows deviation ratios for US and China alignment by model origin and language. US alignment ranges 1.14-1.28 while China alignment ranges 0.72-0.77 across all conditions. Section 4.2.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "The percentage of web content from a country strongly correlates with how well LLMs align with that country's culture (Pearson r=0.94).",
    390       "evidence": "Figure 6(c) plots web content percentage vs deviation ratio with r=0.94. Section 4.3.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Model size has minimal correlation with cultural alignment ability (Pearson r=0.13).",
    395       "evidence": "Figure 6(a) plots model size on log scale vs average deviation ratio. The near-zero correlation is heavily influenced by GLM-4 (9B params) outperforming all larger models. Section 4.3.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "English prompts lead to better cultural alignment on average than Chinese or other language prompts.",
    400       "evidence": "Figure 5 shows English prompts yielding higher average deviation ratios (1.02 and 1.00) than Chinese (0.99 and 0.96) or non-EN/ZH (0.94 and 0.95) for US-origin and China-origin models respectively. Section 4.2.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "Undisclosed conflict of interest",
    407       "detail": "All four authors are affiliated with Tsinghua University. GLM-4 was developed by Zhipu AI/Tsinghua and is found to be the best-performing model overall. This conflict is not acknowledged, and no mitigation strategies (e.g., blinded evaluation, independent analysis) are described."
    408     },
    409     {
    410       "flag": "Massive contamination risk unaddressed",
    411       "detail": "Hofstede's cultural dimensions and country scores have been published since 1980 and appear on countless websites, textbooks, and academic papers. All tested models almost certainly encountered this exact data during training. The study cannot distinguish whether models are exhibiting genuine cultural understanding or simply recalling memorized Hofstede scores. This fundamental threat to validity is not discussed."
    412     },
    413     {
    414       "flag": "No error bars or uncertainty quantification",
    415       "detail": "Despite running each condition 3 times, no standard deviation, variance, or confidence intervals are reported. All results are point estimates, making it impossible to assess whether observed differences between models or countries are statistically meaningful."
    416     },
    417     {
    418       "flag": "No significance tests for comparative claims",
    419       "detail": "The paper ranks all 10 models and 20 countries but performs no statistical tests to determine whether ranking differences are significant. The difference between models ranked 2nd and 8th may be within noise."
    420     },
    421     {
    422       "flag": "Construct validity concern",
    423       "detail": "The paper assumes that LLM responses to a forced-choice cultural questionnaire measure 'cultural value alignment,' but does not discuss whether this maps to actual cultural behavior in LLM outputs during real interactions. An LLM could give culturally appropriate survey answers by memorizing Hofstede scores while still producing culturally biased text."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "A survey of large language models",
    429       "authors": ["Wayne Xin Zhao", "Kun Zhou", "Junyi Li"],
    430       "year": 2023,
    431       "arxiv_id": "2303.18223",
    432       "relevance": "Comprehensive survey of LLM capabilities and limitations, foundational reference for the LLM evaluation field."
    433     },
    434     {
    435       "title": "Unintended Impacts of LLM Alignment on Global Representation",
    436       "authors": ["Michael J. Ryan", "William Held", "Diyi Yang"],
    437       "year": 2024,
    438       "arxiv_id": "2402.15018",
    439       "relevance": "Directly studies how LLM alignment processes affect cultural representation, closely related to cultural bias in LLM outputs."
    440     },
    441     {
    442       "title": "Assessing Cross-Cultural Alignment between ChatGPT and Human Societies: An Empirical Study",
    443       "authors": ["Yong Cao", "Li Zhou", "Seolhwa Lee"],
    444       "year": 2023,
    445       "arxiv_id": "2303.17466",
    446       "relevance": "Prior work evaluating cultural alignment using Hofstede's framework on GPT models with five languages."
    447     },
    448     {
    449       "title": "BLEnD: A Benchmark for LLMs on Everyday Knowledge in Diverse Cultures and Languages",
    450       "authors": ["Junho Myung", "Nayeon Lee", "Yi Zhou"],
    451       "year": 2024,
    452       "relevance": "NeurIPS benchmark for evaluating LLMs' ability to adhere to culturally specific knowledge across languages."
    453     },
    454     {
    455       "title": "Investigating Cultural Alignment of Large Language Models",
    456       "authors": ["Badr AlKhamissi", "Muhammad ElNokrashy", "Mai AlKhamissi", "Mona Diab"],
    457       "year": 2024,
    458       "arxiv_id": "2402.13231",
    459       "relevance": "Evaluates cultural alignment of LLMs for Egypt and US using Hofstede's survey, a direct methodological predecessor."
    460     },
    461     {
    462       "title": "Having Beer after Prayer? Measuring Cultural Bias in Large Language Models",
    463       "authors": ["Tarek Naous", "Michael J. Ryan", "Alan Ritter", "Wei Xu"],
    464       "year": 2024,
    465       "arxiv_id": "2305.14456",
    466       "relevance": "Confirms Western cultural bias in LLMs due to training corpora composition."
    467     },
    468     {
    469       "title": "Large Language Models Reflect the Ideology of their Creators",
    470       "authors": ["Maarten Buyl", "Alexander Rogiers", "Sander Noels"],
    471       "year": 2024,
    472       "arxiv_id": "2410.18417",
    473       "relevance": "Studies how LLM creators' ideological values are embedded in model outputs, directly relevant to model-origin analysis."
    474     },
    475     {
    476       "title": "Whose Opinions Do Language Models Reflect?",
    477       "authors": ["Shibani Santurkar", "Esin Durmus", "Faisal Ladhak"],
    478       "year": 2023,
    479       "relevance": "Examines whose values and opinions LLMs represent, foundational work on alignment and representation bias."
    480     },
    481     {
    482       "title": "Towards Measuring the Representation of Subjective Global Opinions in Language Models",
    483       "authors": ["Esin Durmus", "Karina Nguyen", "Thomas I. Liao"],
    484       "year": 2023,
    485       "arxiv_id": "2306.16388",
    486       "relevance": "Proposes culture auditing methods and finds cross-national prompting can adapt LLM responses to specific cultures."
    487     },
    488     {
    489       "title": "Auditing and Mitigating Cultural Bias in LLMs",
    490       "authors": ["Yan Tao", "Olga Viberg", "Ryan S. Baker", "Rene F. Kizilcec"],
    491       "year": 2023,
    492       "arxiv_id": "2311.14096",
    493       "relevance": "Quantifies cultural bias of GPT models using the Integrated Values Survey and proposes cultural prompting mitigation strategies."
    494     },
    495     {
    496       "title": "SafeWorld: Geo-Diverse Safety Alignment",
    497       "authors": ["Da Yin", "Haoyi Qiu", "Kung-Hsiang Huang", "Kai-Wei Chang", "Nanyun Peng"],
    498       "year": 2025,
    499       "relevance": "Addresses cultural and legal appropriateness in LLM social interactions across geographic regions."
    500     },
    501     {
    502       "title": "The Alignment Problem from a Deep Learning Perspective",
    503       "authors": ["Richard Ngo", "Lawrence Chan", "Sören Mindermann"],
    504       "year": 2024,
    505       "arxiv_id": "2209.00626",
    506       "relevance": "Foundational work on AI alignment challenges relevant to cultural value alignment assessment."
    507     }
    508   ],
    509   "engagement_factors": {
    510     "practical_relevance": {
    511       "score": 1,
    512       "justification": "Findings highlight cultural bias but offer no tool, technique, or actionable fix for practitioners."
    513     },
    514     "surprise_contrarian": {
    515       "score": 1,
    516       "justification": "Largely confirms expected Western/US bias; GLM-4 outperforming larger models is mildly surprising but not paradigm-shifting."
    517     },
    518     "fear_safety": {
    519       "score": 1,
    520       "justification": "Cultural bias propagation is a concern but is already widely discussed; no novel attack or existential risk demonstrated."
    521     },
    522     "drama_conflict": {
    523       "score": 1,
    524       "justification": "Mild US-vs-China model comparison angle, but conclusions are measured and diplomatic."
    525     },
    526     "demo_ability": {
    527       "score": 0,
    528       "justification": "No code, demo, dataset release, or pip-installable tool is provided."
    529     },
    530     "brand_recognition": {
    531       "score": 2,
    532       "justification": "Involves well-known models (GPT-4, GPT-4o, Gemini, LLaMA, Qwen, DeepSeek) but authors and lab are not widely recognized."
    533     }
    534   }
    535 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs