ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (26000B)


      1 {
      2   "paper": {
      3     "title": "CAReDiO: Cultural Alignment of LLM via Representativeness and Distinctiveness Guided Data Optimization",
      4     "authors": [
      5       "Jing Yao",
      6       "Xiaoyuan Yi",
      7       "Jindong Wang",
      8       "Zhicheng Dou",
      9       "Xing Xie"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv preprint",
     13     "arxiv_id": "2504.08820"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "The paper states 'We would release the code and synthesized data for reproduction' (Sec. 4.3), which is a promise of future release. No working URL or repository link is provided anywhere in the paper."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "Same as code — the paper promises to release synthesized data but no download link or repository URL is provided. The evaluation benchmarks (GlobalOpinionQA, CulturalBench, CultureBank, Prism) are publicly available, but CARDSet itself is not released."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper mentions 'Experiments are completed using NVIDIA A100 (80G)' and names model backbones (Llama-3.1-8B-Instruct, Qwen2.5-7B-Instruct), but provides no requirements.txt, Dockerfile, library versions, or detailed environment setup."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The method is described algorithmically but there are no runnable commands or concrete reproduction guides."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "All results in Tables 2-6 are reported as point estimates only. No confidence intervals, error bars, or ± notation is used anywhere in the paper."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper claims CAReDiO 'significantly enhances cultural alignment' and 'surpasses the effects of the simple role-playing strategy and various fine-tuning baselines' (Sec. 5.1) based solely on comparing numbers in tables without any statistical significance tests."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Results are reported as raw scores (accuracy, 1-JS distance, GPT-4o quality scores) without contextualizing effect sizes such as Cohen's d, percentage improvement with baseline context, or other standardized measures."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The paper uses 1,000 training samples per culture without justifying why this number was chosen. No power analysis or discussion of whether this sample size is sufficient for the claims made."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No standard deviation, variance, or spread measures are reported across any experimental runs. All results appear to be single-run numbers. There is no mention of multiple seeds or repeated experiments."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The paper compares against multiple baselines in three categories: generally aligned LLMs (GPT-3.5-turbo, GPT-4-turbo, GPT-4o-mini), role-playing instructions, and fine-tuned culture-specific LLMs (CultureLLM, CulturePark, CultureSPA, CultureBank) as described in Sec. 4.2."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The baselines include recent works: CultureLLM (2024), CulturePark (2024), CultureSPA (2024), CultureBank (2024), and contemporary models like Llama-3.1-8B-Instruct and Qwen2.5-7B-Instruct. These are the current state-of-the-art in cultural alignment."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper presents two variants of the framework — CAReDiO-Cluster and CAReDiO-In context — which use different representativeness metrics. Section 5.2 varies training sample sizes from 100 to 1,000. However, there is no ablation that isolates the individual effects of representativeness optimization vs. distinctiveness optimization vs. cognitive conflict theory."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The paper evaluates on four distinct benchmarks with different metrics: 1 - JS Distance (GlobalOpinionQA), Accuracy (CulturalBench-Hard), GPT-4o response quality (CultureBank and Prism)."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "All evaluations are automated. CultureBank and Prism response quality is scored by GPT-4o (1-5 scale), not by human judges. No human evaluation of the system's culturally aligned outputs is conducted. Given the paper claims to improve cultural alignment — a subjective quality — human evaluation would be relevant."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The evaluation benchmarks (GlobalOpinionQA, CulturalBench-Hard, CultureBank test split, Prism) are all separate from the CARDSet training data used for fine-tuning. CulturalBench is used only for the in-context representativeness metric, and the paper uses CulturalBench-Hard (a different version) for evaluation."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Tables 3-6 in the appendix provide per-culture breakdowns (United Kingdom, China, South Korea, India, Singapore) for all four benchmarks, rather than just reporting averages."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "The paper shows only successful case studies (Figure 4, Tables 7-8) where CAReDiO outperforms baselines. No failure cases, error analysis, or examples where the approach breaks down are discussed."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "The paper does not report any configurations or approaches that failed. The CAReDiO-In context variant performs slightly worse than CAReDiO-Cluster on some benchmarks, but this is not discussed as a negative finding. On CulturalBench-Hard with Qwen2.5, CAReDiO underperforms CultureLLM (32.35 vs 43.35) but this is not acknowledged in the text."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The abstract claims 'our method generates more effective data and enables cultural alignment with as few as 100 training samples, enhancing both performance and efficiency.' While Figure 2 shows results at 100 samples, CAReDiO does not consistently outperform all baselines across all benchmarks — notably on CulturalBench-Hard, CAReDiO underperforms CultureLLM with Qwen2.5 (32.35 vs 43.35). The sweeping claim of superiority is not fully supported."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper makes causal claims: 'leveraging the cultural data synthesized through our framework significantly enhances cultural alignment' (Sec. 5.1) and implies the representativeness/distinctiveness optimization causes better alignment. However, multiple variables change simultaneously (data generation method, selection strategy, prompting approach) without controlled single-variable manipulation to isolate causation."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper tests on 5 cultures only (UK, China, South Korea, India, Singapore) but frames the approach broadly as enabling 'cultural alignment' generally. The title 'Cultural Alignment of LLM' and abstract language suggest generality beyond the 5 tested cultures. Limitation (2) mentions this but the title and abstract do not bound the claims."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper does not discuss alternative explanations for its results. For instance, the improvements could be due to simply using GPT-4o-mini for data generation (a more capable model than baselines used), the question adaptation step, or the data selection filtering — but these alternatives are not considered. The limitations section discusses constraints but not confounds."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper mentions 'GPT-4o-mini' for data generation and evaluation, 'gpt-3.5-turbo', 'gpt-4-turbo' for baselines, and 'Llama-3.1-8B-Instruct' and 'Qwen2.5-7B-Instruct' as backbone models. However, no API version numbers or snapshot dates are specified (e.g., no 'gpt-4o-mini-2024-07-18' or similar). Marketing names without versions are insufficient."
    139       },
    140       "prompts_provided": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper describes its prompting approach (cultural question adaptation via chain-of-thought, cognitive conflict theory for response generation, role-playing instructions) but does not provide the actual prompt text used. The process is described in natural language only."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "The paper reports the clustering threshold theta=0.7, uses 100 questions per topic and 1000 training samples. However, no LLM API hyperparameters (temperature, top-p, max_tokens) are reported for either the generation or evaluation stages. Fine-tuning hyperparameters (learning rate, epochs, batch size) are also missing."
    149       },
    150       "scaffolding_described": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "The paper does not use agentic scaffolding. The pipeline is a data construction and fine-tuning framework, not an agentic system with tool use, retry logic, or memory management."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 3 describes the full data construction pipeline: cultural framework with 38 topics across 4 levels, Self-Instruct to generate questions per topic, cultural question adaptation, cognitive conflict response generation, clustering with threshold 0.7, redundancy removal, representativeness and distinctiveness scoring, and selection by combined score. The pipeline stages are clearly described."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 7 is dedicated to 'Limitations' and discusses four specific limitations in substantive detail."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The limitations section discusses specific threats: (1) reliance on LLMs for data generation introduces cultural bias from those LLMs, possibly inadequate for low-resource cultures; (2) only 5 cultures tested; (3) representativeness optimization may miss long-tail or emerging practices; (4) only SFT used, not DPO. These are specific to this study."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "While the limitations mention only 5 cultures were tested and the method relies on LLM-generated data, the paper does not explicitly state what the results do NOT show. There is no explicit bounding statement like 'our results apply only to X and do not demonstrate Y.' The limitations are framed as future work rather than explicit scope boundaries."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "Neither the raw generated data (CARDSet) nor the intermediate outputs (embeddings, cluster assignments, scores) are made available. The paper promises future release but provides no access."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 3 describes the data collection procedure in detail: how questions are generated using Self-Instruct from a 38-topic cultural framework, how they are adapted per culture using GPT-4o-mini with chain-of-thought reasoning, and how responses are generated using cognitive conflict theory."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "No human participants are involved. Data is synthetically generated by LLMs. The evaluation benchmarks are pre-existing public datasets."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The full pipeline is documented in Section 3: cultural framework construction → question generation (100 per topic via Self-Instruct) → cultural question adaptation → distinctive response generation → embedding and clustering (theta=0.7) → representativeness and distinctiveness scoring → selection by combined score. Section 4.3 specifies 1000 samples per culture for training."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No funding source or acknowledgments section is present in the paper. Authors are from Microsoft Research Asia, William & Mary, and Renmin University of China, but no grants or funding are disclosed."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Author affiliations are clearly listed: Microsoft Research Asia (Jing Yao, Xiaoyuan Yi, Xing Xie), William & Mary (Jindong Wang), and Renmin University of China (Zhicheng Dou)."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding information is disclosed. Three of five authors are from Microsoft Research Asia, and the paper uses Microsoft/OpenAI APIs (GPT-4o-mini) for data generation and evaluation. The potential conflict of interest between the corporate affiliation and the use of proprietary APIs is not acknowledged."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No competing interests statement or financial interest declarations are present in the paper."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The paper fine-tunes Llama-3.1-8B-Instruct and Qwen2.5-7B-Instruct and evaluates them on cultural benchmarks. No training data cutoff dates are stated for any of the models used."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "The evaluation benchmarks (GlobalOpinionQA, CulturalBench, CultureBank, Prism) are public and may overlap with the pre-training data of the backbone models. This is not discussed."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "Several evaluation benchmarks (GlobalOpinionQA 2023, CulturalBench 2024, CultureBank 2024) were published before the training data of the backbone models was likely collected. No contamination analysis is provided."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants are involved in this study. All data is synthetically generated and evaluation is automated."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants are involved."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "The paper proposes a method that calls GPT-4o-mini for data generation (question adaptation, response generation) and uses OpenAI embedding APIs for clustering. No API costs, token counts, or wall-clock time are reported for any stage of the pipeline."
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "The paper mentions 'Experiments are completed using NVIDIA A100 (80G)' (Sec. 4.3) but does not quantify total GPU hours, training time, or total API spend for the data generation pipeline."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "CAReDiO generates more effective cultural alignment data than existing methods, enabling alignment with as few as 100 training samples.",
    292       "evidence": "Figure 2 shows performance curves from 100 to 1000 samples. On the Prism benchmark, CAReDiO reaches top performance with 100 samples. Table 2 shows overall averages across 4 benchmarks.",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "CAReDiO consistently surpasses fine-tuning baselines and role-playing strategies across most benchmarks.",
    297       "evidence": "Table 2 shows CAReDiO outperforms baselines on GlobalOpinionQA, CultureBank, and Prism for both backbone models. However, on CulturalBench-Hard, CAReDiO-Cluster scores 34.90 with Llama (vs CultureLLM 31.51, CultureSPA 32.61) but CAReDiO-Cluster scores 32.35 with Qwen (vs CultureLLM 43.35). Results are mixed.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "CAReDiO's data shows superior clustering with clearly distinct groups for different cultures compared to CulturePark.",
    302       "evidence": "Figure 3(a) shows PCA visualization of CAReDiO vs CulturePark embeddings. CAReDiO shows more separated clusters. This is a qualitative visual comparison without quantitative clustering metrics.",
    303       "supported": "weak"
    304     },
    305     {
    306       "claim": "CAReDiO models demonstrate competitive performance with state-of-the-art proprietary LLMs like GPT-4o-mini and GPT-4-turbo on the Prism dataset.",
    307       "evidence": "Table 2 shows CAReDiO (Llama) scores 4.12 on Prism vs GPT-4o-mini + Role-Play at 4.05 and GPT-4-turbo + Role-Play at 3.92. CAReDiO (Qwen) scores 4.03, also competitive.",
    308       "supported": "strong"
    309     }
    310   ],
    311   "methodology_tags": [
    312     "benchmark-eval"
    313   ],
    314   "key_findings": "CAReDiO introduces a framework for constructing cultural alignment data by optimizing representativeness and distinctiveness, grounded in culture theory. The method uses LLMs to generate and select cultural conversation data, then fine-tunes smaller models. On four cultural benchmarks across five cultures, CAReDiO-fine-tuned models (Llama-3.1-8B, Qwen2.5-7B) generally outperform prior cultural alignment methods, particularly on open-ended benchmarks like Prism and CultureBank. The approach achieves comparable or superior performance with as few as 100-500 training samples, suggesting efficiency gains over existing methods.",
    315   "red_flags": [
    316     {
    317       "flag": "GPT-4o as evaluator without validation",
    318       "detail": "Two of four evaluation benchmarks (CultureBank and Prism) use GPT-4o to score response quality on a 1-5 scale. No validation of GPT-4o's evaluation reliability or correlation with human judgment is provided. This is particularly concerning for cultural sensitivity evaluation, where LLM judges may have their own cultural biases."
    319     },
    320     {
    321       "flag": "No statistical testing on comparative claims",
    322       "detail": "All claims of superiority ('significantly enhances', 'surpasses') are based on comparing point estimates without significance tests, confidence intervals, or any indication of result variance. Single-run results cannot support claims of significant improvement."
    323     },
    324     {
    325       "flag": "Inconsistent performance not acknowledged",
    326       "detail": "On CulturalBench-Hard with Qwen2.5, CAReDiO-Cluster scores 32.35 while CultureLLM scores 43.35 — a large underperformance. This is visible in Table 4 but not discussed in the analysis, which focuses on aggregate averages and cherry-picked successes."
    327     },
    328     {
    329       "flag": "Microsoft affiliation evaluating with Microsoft/OpenAI APIs",
    330       "detail": "Three of five authors are from Microsoft Research Asia. The framework relies on GPT-4o-mini (a Microsoft/OpenAI product) for data generation and GPT-4o for evaluation. This potential conflict of interest is not acknowledged."
    331     },
    332     {
    333       "flag": "Circular evaluation risk",
    334       "detail": "GPT-4o-mini is used to generate the cultural training data, and GPT-4o is used to evaluate response quality on CultureBank and Prism. Using the same model family for both generation and evaluation creates a risk of stylistic bias favoring the approach."
    335     }
    336   ],
    337   "cited_papers": [
    338     {
    339       "title": "CultureLLM: Incorporating Cultural Differences into Large Language Models",
    340       "authors": ["Cheng Li", "Mengzhou Chen", "Jindong Wang", "Sunayana Sitaram", "Xing Xie"],
    341       "year": 2024,
    342       "arxiv_id": "2402.10946",
    343       "relevance": "Directly comparable cultural alignment method using World Value Survey data as seeds for LLM-generated training data."
    344     },
    345     {
    346       "title": "CulturePark: Boosting Cross-Cultural Understanding in Large Language Models",
    347       "authors": ["Cheng Li", "Damien Teney", "Linyi Yang", "Qingsong Wen", "Xing Xie", "Jindong Wang"],
    348       "year": 2024,
    349       "arxiv_id": "2405.15145",
    350       "relevance": "Multi-agent framework baseline for cultural data generation, directly compared in the evaluation."
    351     },
    352     {
    353       "title": "Towards Measuring the Representation of Subjective Global Opinions in Language Models",
    354       "authors": ["Esin Durmus", "Karina Nguyen", "Thomas I Liao"],
    355       "year": 2023,
    356       "arxiv_id": "2306.16388",
    357       "relevance": "GlobalOpinionQA benchmark used for evaluation; foundational work on cultural representation in LLMs."
    358     },
    359     {
    360       "title": "CulturalBench: A Robust, Diverse and Challenging Benchmark on Measuring the (Lack of) Cultural Knowledge of LLMs",
    361       "authors": ["Yu Ying Chiu", "Liwei Jiang", "Bill Yuchen Lin"],
    362       "year": 2024,
    363       "arxiv_id": "2410.02677",
    364       "relevance": "Evaluation benchmark for cultural knowledge in LLMs, used for both representativeness scoring and evaluation."
    365     },
    366     {
    367       "title": "The Prism Alignment Dataset: What Participatory, Representative and Individualised Human Feedback Reveals about the Subjective and Multicultural Alignment of Large Language Models",
    368       "authors": ["Hannah Rose Kirk", "Alexander Whitefield", "Paul Röttger"],
    369       "year": 2025,
    370       "relevance": "Evaluation benchmark with real cross-cultural conversations; measures cultural awareness in LLM responses."
    371     },
    372     {
    373       "title": "Self-Pluralising Culture Alignment for Large Language Models",
    374       "authors": ["Shaoyang Xu", "Yongqi Leng", "Linhao Yu", "Deyi Xiong"],
    375       "year": 2024,
    376       "arxiv_id": "2410.12971",
    377       "relevance": "CultureSPA baseline method for identifying culture-representative data through activation analysis."
    378     },
    379     {
    380       "title": "CultureBank: An Online Community-Driven Knowledge Base towards Culturally Aware Language Technologies",
    381       "authors": ["Weiyan Shi", "Ryan Li", "Yutong Zhang"],
    382       "year": 2024,
    383       "arxiv_id": "2404.15238",
    384       "relevance": "Cultural knowledge base from TikTok narratives, used as both a baseline and evaluation benchmark."
    385     },
    386     {
    387       "title": "Training Language Models to Follow Instructions with Human Feedback",
    388       "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"],
    389       "year": 2022,
    390       "relevance": "Foundational RLHF alignment work that this paper extends to cultural alignment settings."
    391     },
    392     {
    393       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    394       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"],
    395       "year": 2024,
    396       "relevance": "DPO alignment algorithm discussed as an alternative to SFT for cultural fine-tuning."
    397     },
    398     {
    399       "title": "Unintended Impacts of LLM Alignment on Global Representation",
    400       "authors": ["Michael J Ryan", "William Held", "Diyi Yang"],
    401       "year": 2024,
    402       "arxiv_id": "2402.15018",
    403       "relevance": "Demonstrates that standard alignment techniques can increase Western cultural bias in LLMs, motivating cultural alignment research."
    404     }
    405   ]
    406 }

Impressum · Datenschutz