ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (31075B)


      1 {
      2   "paper": {
      3     "title": "Unintended Impacts of LLM Alignment on Global Representation",
      4     "authors": [
      5       "Michael J. Ryan",
      6       "William Held",
      7       "Diyi Yang"
      8     ],
      9     "year": 2024,
     10     "venue": "Annual Meeting of the Association for Computational Linguistics",
     11     "arxiv_id": "2402.15018",
     12     "doi": "10.48550/arXiv.2402.15018"
     13   },
     14   "scan_version": 3,
     15   "active_modules": [
     16     "experimental_rigor",
     17     "data_leakage"
     18   ],
     19   "methodology_tags": [
     20     "benchmark-eval",
     21     "observational"
     22   ],
     23   "key_findings": "Alignment (SFT + preference tuning) improves English task performance across dialects but significantly increases US English disparity over Indian and Nigerian English (up to 17.1%). Alignment largely improves multilingual performance due to unintentional multilingual data in SFT datasets (~13.1% of Tülu SFT data is non-English). All alignment procedures increase model agreement with US opinions relative to non-Western nations (Jordan, China, Nigeria). The Starling reward model correlates highly with US citizen opinions (Spearman 0.926) but these preferences do not propagate to the preference-tuned LM on out-of-distribution settings.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper states 'We make our code and data publicly available on Github' and provides a URL: https://github.com/SALT-NLP/unintended-impacts-of-alignment (Abstract and footnote 1)."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The authors release the AskReddit dataset of 554 country-specific questions. The other datasets used (MD3, TyDiQA, Belebele, GlobalOpinionsQA) are publicly available benchmarks. The paper states code and data are publicly available."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper mentions 'All experiments were performed on an A6000 GPU' and '8-bit quantization using the BitsAndBytes library' (Appendix B), but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper provides prompts in the appendix and describes the experimental setup, but does not include step-by-step reproduction instructions. The GitHub repo is referenced but the paper itself lacks a 'Reproducing Results' section."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports 95% confidence intervals throughout, e.g., 'Figure 3... with 95% confidence intervals' for MD3 and 'Figure 5... with 95% confidence intervals' for GlobalOpinionsQA."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper uses significance tests with p<0.05 threshold. For MD3: 'Whenever changes to performance are significant (p<0.05)'. For Belebele: asterisks mark significant differences (Figure 4). Spearman correlation reported with p-values (e.g., p=1.78E-9)."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Effect sizes are reported with baseline context throughout: 'Indian English accuracy increased by 15.2%, Nigerian English accuracy increased by 20.3% and American English accuracy increased by 29.3%' (§4). Spearman correlations reported as 0.926 and 0.849."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No power analysis or justification for sample sizes. The paper uses existing benchmark sizes without discussing whether they are adequate for the claims being made."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The paper reports confidence intervals computed from dataset instances but does not report variance across multiple experimental runs. Greedy decoding is used (deterministic), which partially mitigates this, but for reading comprehension tasks with random 1-shot demonstrations (TyDiQA), variance across demonstration samples is not reported."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The core design compares base models against SFT and preference-tuned variants: Llama 2 → Llama 2 Chat, Llama 2 → Tülu SFT → Tülu DPO, Mistral → Mistral SFT → Zephyr, Mistral → OpenChat → Starling (Table 1, Figures 3-5)."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The models evaluated (Llama 2, Mistral v0.1, Tülu 2, Starling, Zephyr) were all contemporary at the time of writing. Additional validation with Qwen 1.5 and Yi-6B in the appendix."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The staged comparison (Base → SFT → PT) effectively ablates the contribution of each alignment step. The paper systematically shows effects of SFT and PT separately across all three axes of evaluation."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Multiple metrics are used across tasks: accuracy for MD3 intent prediction, CFMScore for TyDiQA, accuracy for Belebele, 1-Jensen Shannon divergence for GlobalOpinionsQA, Spearman rank correlation for reward model probing, and mean rank for country rankings."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "All evaluation of model outputs is automated. Two authors manually labeled the AskReddit dataset for quality (Cohen's kappa 0.963), but this is dataset construction, not evaluation of model outputs."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper uses standard test sets from established benchmarks (MD3, TyDiQA GoldP, Belebele, GlobalOpinionsQA). These are separate from any model training or tuning data."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Extensive per-category breakdowns: results by dialect (US, Indian, Nigerian) in Figure 3, by language (9 languages) in Figure 4, by country (7 countries) in Figure 5, and by question category (11 categories) for the AskReddit dataset."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Multiple failure cases discussed: Bengali performance decreases across all models (§5), Zephyr TyDiQA performance decreases significantly in six of nine languages, Qwen Chat's reasoning preamble exceeding token limits (Appendix F.1)."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Several negative results: Bengali performance worsens across all models, Zephyr TyDiQA decreases in 6/9 languages, reward model biases do NOT propagate to the preference-tuned LM (§6.1), and alignment increases disparity between dialects."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The three main abstract claims are supported: (1) alignment creates disparities between English dialects (Figure 3), (2) alignment improves multilingual capabilities (Figure 4), (3) alignment increases similarity to US opinions (Figure 5). The claim about code/data availability is supported by the GitHub link."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper makes causal claims (e.g., 'alignment improves performance', 'alignment increases the disparity') but uses observational comparison of existing model checkpoints. The authors acknowledge in Limitations: 'Since we use open checkpoints rather than aligning the models ourselves, we cannot directly test individual changes to the alignment procedure and their downstream impacts.'"
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The title 'Unintended Impacts of LLM Alignment on Global Representation' frames results broadly. However, the study tests only 7B open-source models from Llama 2 and Mistral families (with Qwen/Yi in appendix). The abstract says 'current alignment procedures create disparities' without qualifying to 7B models or specific model families."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper substantively discusses alternative explanations: multilingual SFT data composition explains multilingual improvements (Table 2, §5), pre-training data defines out-of-distribution preferences rather than reward models (§6.1), and different SFT datasets explain varying performance (UltraChat vs Tülu mix)."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper measures specific proxies (intent prediction accuracy, extractive QA scores, reading comprehension accuracy, opinion distribution similarity) and frames them as aspects of 'global representation' rather than claiming to capture all of it. The limitations acknowledge 'we test on a limited set of tasks.'"
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Exact model names with versions are specified: 'Llama 2 7B', 'Mistral v0.1 7B', 'Tülu 2 7B DPO', 'Starling LM 7B', 'Zephyr-7B-beta', 'OpenChat3.5', 'Qwen1.5-7B', 'Yi-6B' (§3, Table 1, Appendix F). These are specific HuggingFace model identifiers."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Full prompt text is provided in the appendix for all four tasks: MD3 (Table 8), TyDiQA (Table 9), Belebele (Table 10), and GlobalOpinionsQA (Table 11)."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Key hyperparameters are reported: greedy decoding for generation tasks (Appendix B), 8-bit quantization via BitsAndBytes, 1-shot setting for TyDiQA, and language modeling probability for Belebele answer selection."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "No agentic scaffolding is used. The paper evaluates models directly on benchmark tasks with standard prompting."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Data preprocessing is documented: MD3 transcripts filtered to successful guesses (§4), GlobalOpinionsQA filtered to 245 questions with responses from all 7 countries (§6), AskReddit filtered from 957 to 554 questions with documented removal criteria (§6.1), SFT datasets analyzed with language ID (§5, Table 2)."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "A dedicated 'Limitations' section discusses specific constraints: inability to disentangle SFT/RLHF for Llama 2 Chat, use of released checkpoints rather than controlled alignment, and limited set of tasks."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Specific threats discussed: 'Since Llama 2 SFT has not been publicly released, we cannot disentangle the effects of SFT and RLHF', 'Since we use open checkpoints rather than aligning the models ourselves, we cannot directly test individual changes', limited task coverage may miss other failure modes."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "The paper explicitly states scope boundaries: 'Since we test on a limited set of tasks, it is possible that failure modes arise on tasks that we did not assess', 'We leave causal intervention and interpretability studies on the impacts of alignment to future work.'"
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The evaluation benchmarks (MD3, TyDiQA, Belebele, GlobalOpinionsQA) are publicly available. The new AskReddit dataset of 554 questions is released for academic use. Model weights are publicly available on HuggingFace."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Data collection for the AskReddit dataset is described in detail (§6.1): search queries used, initial count (957), filtering criteria, manual labeling by two authors with Cohen's kappa of 0.963, ChatGPT template generation, and manual validation. Existing benchmark origins are cited."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants were recruited. The paper evaluates existing models on existing benchmarks. The AskReddit questions come from a public subreddit, and two of the paper's authors performed manual labeling."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "The data pipeline for AskReddit is documented: search queries → 957 questions → duplicate removal → quality filtering → 554 questions → manual positive/negative labeling (Cohen's kappa 0.963) → ChatGPT template generation → manual validation → categorization into 11 categories."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Funding is disclosed in Acknowledgements: 'This work was funded in part by a Meta grant and an NSF grant IIS-2247357.'"
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are listed: Michael J. Ryan and Diyi Yang at Stanford University, William Held at Georgia Institute of Technology."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "Meta partially funded the research and Llama 2 (a Meta model) is one of the primary models evaluated. Meta has a financial interest in how its alignment procedures are perceived, creating a potential conflict."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests statement is present in the paper. The Meta funding relationship and potential conflicts are not explicitly discussed beyond the acknowledgements section."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No training data cutoff dates are stated for any of the nine models evaluated. The paper notes pre-training data 'is known to come from the open internet' (Table 1) but does not specify temporal boundaries."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No analysis of whether benchmark examples appeared in the models' training data. The benchmarks used (MD3, TyDiQA, Belebele, GlobalOpinionsQA) could potentially overlap with pre-training corpora, but this is not discussed."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "No discussion of benchmark contamination risk. TyDiQA (2020) and Belebele (2023) were published before or contemporaneously with the evaluated models' training, creating potential contamination, but this is not addressed."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study. The paper evaluates language models on existing benchmarks."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants. The paper evaluates language models and creates a dataset from public Reddit posts."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants. Model evaluations are automated."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants recruited for the study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants or experimental conditions to randomize."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants; blinding is not applicable."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants; attrition is not applicable."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No inference cost or latency reported despite running 9 models across 4 benchmark tasks in multiple languages and dialects."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The paper mentions 'All experiments were performed on an A6000 GPU' (Appendix B) but does not state total GPU hours or computational budget."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No results reported across multiple random seeds. Greedy decoding mitigates randomness for generation tasks, but TyDiQA uses a random 1-shot demonstration, and seed sensitivity for this is not addressed."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The number of experimental runs is not explicitly stated. It is implied to be a single run with greedy decoding, but this is not confirmed."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No hyperparameter search is performed. The paper evaluates existing released model checkpoints with fixed evaluation settings."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No configuration selection is performed. The paper uses existing model checkpoints and standard evaluation protocols."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Many comparisons are made across models, languages, dialects, and countries with significance tests (p<0.05), but no correction for multiple comparisons (e.g., Bonferroni) is mentioned."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": false,
    330         "answer": false,
    331         "justification": "The paper does not propose its own system. It evaluates existing models from other developers."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "The paper compares alignment stages, not methods at different compute levels. All main-text models are 7B parameters (the appendix adds Qwen1.5-7B and Yi-6B)."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "The paper does not discuss whether the chosen benchmarks (MD3 intent prediction, TyDiQA extractive QA, Belebele reading comprehension) are valid proxies for 'global representation.' The leap from specific task performance to 'representation' is not examined."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "No scaffolding is used in the evaluation. Models are prompted directly."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether benchmark data (e.g., TyDiQA from 2020, Belebele from 2023) existed before the models' training data was collected."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether evaluation setup leaks information. For TyDiQA, gold passages are provided by design, but the paper does not discuss whether this represents real-world conditions."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No discussion of whether training and test data share structural similarities or are drawn from overlapping sources."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No leakage detection or prevention method is applied (e.g., no canary strings, membership inference, or n-gram overlap analysis)."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Alignment significantly increases the disparity between English dialects from about 1% before alignment to as high as 17.1% after alignment.",
    375       "evidence": "Figure 3 shows that base models perform relatively the same across dialects (~5% for Llama, ~8% for Mistral), but after SFT and PT, US English gains are significantly larger than Indian or Nigerian English gains (§4). Statistical significance at p<0.05 reported.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Alignment largely improves multilingual performance despite SFT data being primarily English.",
    380       "evidence": "Figure 4 shows significant improvements across most languages for Belebele reading comprehension and TyDiQA QA, especially for Tülu and Starling. Language ID reveals ~13.1% of Tülu SFT data is non-English (Table 2, §5).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "All evaluated alignment procedures increase relative agreement with US opinions compared to major non-Western nations (Jordan, China, Nigeria).",
    385       "evidence": "Figure 5 shows consistent shifts toward US opinion alignment across all four model alignment paths (Llama→Chat, Llama→Tülu, Mistral→Zephyr, Mistral→Starling). Additional validation with Qwen and Yi in Appendix F.3 (Figure 11) shows similar trends.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "The Starling reward model correlates highly with US citizen opinions of countries (Spearman 0.926 with 2017 Gallup, 0.849 with 2023 Gallup).",
    390       "evidence": "Table 3 compares Starling RM rankings of 19 countries against Gallup poll rankings, with Spearman correlations and p-values (p=1.78E-9 and p=1.12E-6 respectively) reported in §6.1.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Reward model preferences do not propagate to the preference-tuned language model for out-of-distribution settings.",
    395       "evidence": "Figure 6 shows Spearman correlation between Starling RM and Starling LM country rankings is only 0.59, much lower than within-family correlations (0.95-0.99 for Mistral family). Rankings are dominated by base model family (§6.1).",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "Unintentional multilingual data in SFT datasets (~13.1% of Tülu data) explains multilingual performance improvements.",
    400       "evidence": "Language ID analysis (Table 2) shows 13.1% of Tülu SFT data is non-English (51 languages), while UltraChat is 99.9% English. Tülu/Starling (which use multilingual SFT data) show stronger multilingual gains than Zephyr/Llama Chat (which use English-only SFT data) (§5).",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "Funder conflict not acknowledged",
    407       "detail": "Meta partially funds the research while Llama 2 (a Meta model) is one of the primary models evaluated. This potential conflict of interest is not discussed beyond listing the grant in acknowledgements."
    408     },
    409     {
    410       "flag": "No contamination analysis",
    411       "detail": "The paper evaluates models on benchmarks (TyDiQA published 2020, Belebele 2023) that may overlap with training data of the evaluated models, but no contamination analysis is performed."
    412     },
    413     {
    414       "flag": "No multiple comparison correction",
    415       "detail": "Many significance tests are performed across models, languages, dialects, and countries without correction for multiple comparisons, inflating the risk of false positives."
    416     },
    417     {
    418       "flag": "Observational claims framed as causal",
    419       "detail": "The paper frames alignment effects causally ('alignment improves', 'alignment increases disparity') but uses observational comparisons of existing checkpoints. The authors acknowledge this limitation but the framing throughout the paper remains causal."
    420     },
    421     {
    422       "flag": "Title broader than evidence",
    423       "detail": "The title 'Unintended Impacts of LLM Alignment on Global Representation' implies general findings about LLM alignment, but the study covers only 7B open-source models from two base families (Llama 2 and Mistral, with Qwen/Yi in appendix)."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Training language models to follow instructions with human feedback",
    429       "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"],
    430       "year": 2022,
    431       "relevance": "Foundational RLHF paper that defines the alignment process evaluated in this study; reports demographics of preference annotators."
    432     },
    433     {
    434       "title": "Direct preference optimization: Your language model is secretly a reward model",
    435       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"],
    436       "year": 2023,
    437       "relevance": "Introduces DPO, one of the two main preference tuning algorithms evaluated in this study."
    438     },
    439     {
    440       "title": "Towards measuring the representation of subjective global opinions in language models",
    441       "authors": ["Esin Durmus", "Karina Nguyen", "Thomas I. Liao"],
    442       "year": 2023,
    443       "relevance": "Introduces the GlobalOpinionsQA dataset used in this study to measure LLM agreement with country opinions."
    444     },
    445     {
    446       "title": "Whose opinions do language models reflect?",
    447       "authors": ["Shibani Santurkar", "Esin Durmus", "Faisal Ladhak"],
    448       "year": 2023,
    449       "relevance": "Most similar prior work; studies how RLHF affects political opinions with US demographic groups. This paper extends to global opinions and downstream tasks."
    450     },
    451     {
    452       "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback",
    453       "authors": ["Yuntao Bai", "Andy Jones", "Kamal Ndousse"],
    454       "year": 2022,
    455       "relevance": "Defines the helpful/harmless alignment paradigm and demonstrates RLHF can make errors more subtle."
    456     },
    457     {
    458       "title": "Llama 2: Open foundation and fine-tuned chat models",
    459       "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"],
    460       "year": 2023,
    461       "relevance": "One of the two base model families evaluated; provides Llama 2 and Llama 2 Chat as alignment stage comparison points."
    462     },
    463     {
    464       "title": "Camels in a changing climate: Enhancing LM adaptation with Tulu 2",
    465       "authors": ["Hamish Ivison", "Yizhong Wang", "Valentina Pyatkin"],
    466       "year": 2023,
    467       "relevance": "Provides Tülu 2 models with released SFT and DPO checkpoints, enabling stage-by-stage alignment analysis."
    468     },
    469     {
    470       "title": "Discovering language model behaviors with model-written evaluations",
    471       "authors": ["Ethan Perez", "Sam Ringer", "Kamile Lukosiute"],
    472       "year": 2023,
    473       "relevance": "Finds RLHF makes models echo user opinions and express stronger political views, relevant to alignment bias."
    474     },
    475     {
    476       "title": "The history and risks of reinforcement learning and human feedback",
    477       "authors": ["Nathan Lambert", "Thomas Krendl Gilbert", "Tom Zick"],
    478       "year": 2023,
    479       "relevance": "Comprehensive overview of RLHF risks that contextualizes the unintended impacts studied in this paper."
    480     },
    481     {
    482       "title": "A roadmap to pluralistic alignment",
    483       "authors": ["Taylor Sorensen", "Jared Moore", "Jillian Fisher"],
    484       "year": 2024,
    485       "relevance": "Proposes pluralistic alignment approaches relevant to mitigating the US-centric bias demonstrated in this paper."
    486     },
    487     {
    488       "title": "Mistral 7B",
    489       "authors": ["Albert Q. Jiang", "Alexandre Sablayrolles", "Arthur Mensch"],
    490       "year": 2023,
    491       "arxiv_id": "2310.06825",
    492       "relevance": "Second base model family in the study; provides Mistral v0.1 7B and enables comparison across base model architectures."
    493     },
    494     {
    495       "title": "Zephyr: Direct distillation of LM alignment",
    496       "authors": ["Lewis Tunstall", "Edward Beeching", "Nathan Lambert"],
    497       "year": 2023,
    498       "relevance": "Provides one of the evaluated preference-tuned models (Zephyr-7B-beta) using DPO with AI feedback."
    499     }
    500   ],
    501   "engagement_factors": {
    502     "practical_relevance": {
    503       "score": 1,
    504       "justification": "Identifies problems with alignment but does not provide immediately usable tools or techniques for practitioners."
    505     },
    506     "surprise_contrarian": {
    507       "score": 2,
    508       "justification": "Challenges the implicit assumption that alignment is universally beneficial, showing it systematically increases US-centric bias and dialect disparities."
    509     },
    510     "fear_safety": {
    511       "score": 1,
    512       "justification": "Raises bias and fairness concerns about alignment procedures but does not demonstrate novel attacks or existential risks."
    513     },
    514     "drama_conflict": {
    515       "score": 1,
    516       "justification": "Some controversy around alignment creating disparities and US-centric reward models, but the paper presents findings diplomatically."
    517     },
    518     "demo_ability": {
    519       "score": 1,
    520       "justification": "Code and data released on GitHub, but not a tool or demo that practitioners can immediately try."
    521     },
    522     "brand_recognition": {
    523       "score": 1,
    524       "justification": "Stanford and Georgia Tech affiliations are recognizable; the paper evaluates well-known models (Llama 2, Mistral), but the authors are not affiliated with the companies that developed them."
    525     }
    526   }
    527 }

Impressum · Datenschutz