scan.json (31619B)
1 { 2 "paper": { 3 "title": "Modular Pluralism: Pluralistic Alignment via Multi-LLM Collaboration", 4 "authors": [ 5 "Shangbin Feng", 6 "Taylor Sorensen", 7 "Yuhan Liu", 8 "Jillian Fisher", 9 "Chan Young Park", 10 "Yejin Choi", 11 "Yulia Tsvetkov" 12 ], 13 "year": 2024, 14 "venue": "Conference on Empirical Methods in Natural Language Processing", 15 "arxiv_id": "2406.15951", 16 "doi": "10.48550/arXiv.2406.15951" 17 }, 18 "scan_version": 3, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "methodology_tags": ["benchmark-eval"], 21 "key_findings": "Modular Pluralism augments black-box LLMs with a pool of smaller community LMs fine-tuned on community-specific corpora, improving pluralistic alignment across Overton (up to 50.3% better value coverage), steerable (up to 23.8% on balanced accuracy), and distributional (14.9% J-S distance reduction) objectives. The modular design allows patching underrepresented communities by adding new community LMs, with 5-7% improvements for Asian/African countries. Aligned LLMs benefit more than unaligned ones, and larger models show greater improvements (87.9% for 70B vs 17.2% average for 7B).", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "Footnote 1 states: 'Code and data are publicly available at https://github.com/BunsenFeng/modular_pluralism.' A working URL is provided." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The paper uses publicly available datasets (Value Kaleidoscope, OpinionQA, MoralChoice, GlobalOpinionQA) and states code and data are publicly available at the GitHub repository." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "Implementation details are provided (LoRA, 16-bit precision, model checkpoints) but no requirements.txt, Dockerfile, or dependency specification is mentioned in the paper." 
38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper provides implementation details in Appendix B (hyperparameters, model checkpoints, dataset sampling) but no step-by-step reproduction instructions or scripts to replicate experiments are described." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "All results in Tables 1-3, Figures 2-4, and appendix tables are reported as point estimates with no confidence intervals, error bars, or ± notation." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper makes many comparative claims ('outperforms baselines by up to 23.8%') but no statistical significance tests (p-values, t-tests, etc.) are reported anywhere. The only inter-rater statistic is Fleiss' Kappa for human annotators." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "Percentage improvements are reported with baseline context throughout (e.g., 'improving over baselines by up to 23.8% and 21.8% on balanced accuracy and Macro-F1 scores'), and full baseline and method numbers are provided in all tables, allowing effect size computation." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "Dataset sizes are stated (3,132 situations from VK, 22,378 from OpinionQA, 28,763 from GlobalOpinionQA) but no power analysis or sample size justification is provided. The 100 pairs for human evaluation and 5 annotators are given without justification." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "No variance, standard deviation, or spread measures are reported for any experimental results. All tables show single-run point estimates." 
70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "Three baselines are consistently compared: vanilla (direct prompting), prompting (pluralism-inducing prompt prefix), and MoE (mixture-of-experts routing to most fitting community LM). Section 3 describes all baselines." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "The baselines are appropriate for this novel framework: vanilla LLM prompting, instruction-based pluralism induction, and MoE-style routing are all reasonable contemporary approaches. No prior work exists for this exact problem formulation." 82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "Section 5 includes component analyses: Table 4 compares perspective vs. cultural vs. mixed community LM pools, Figure 6 tests adding individual community LMs for patching, Table 7 analyzes model size effects, and Figure 5 measures message faithfulness." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "Multiple metrics are used: Accuracy, Balanced Accuracy, Macro-F1 for steerable tasks; Jensen-Shannon distance for distributional tasks; NLI-based value coverage for Overton; plus human evaluation and GPT-4-as-judge evaluation." 92 }, 93 "human_evaluation": { 94 "applies": true, 95 "answer": true, 96 "justification": "Section 3 (task 2) describes human evaluation: 5 annotators evaluate 100 response pairs for Overton pluralism, with Fleiss' Kappa of 0.4678 reported. Results shown in Figure 3." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": false, 101 "justification": "No explicit dev/test split is described. Datasets are randomly sampled and evaluated directly. For LLaMA2-70B, 20% is randomly sampled 'due to computing constraints' but this is not a held-out test set. No mention of using a separate validation set for prompt or method tuning." 
102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Table 2 breaks down OpinionQA results by 8 socio-political categories (party, ideology, religion, race, education, income, region, sex). Table 3 breaks down GlobalOpinionQA by 7 countries. Figure 4 separates low- and high-ambiguity MoralChoice scenarios." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": false, 111 "justification": "The paper shows only positive working examples in the appendix (Figures 7-10). While Table 4 shows cultural LMs hurt OpinionQA performance, no qualitative failure analysis or error examples are provided." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "Table 4 shows cultural community LMs hurt US-centric OpinionQA performance. Table 7 shows -10.6% on Overton for 7B model. Table 8 shows some models where prompting outperforms Modular Pluralism. These negative findings are discussed honestly." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "Abstract claims about improving Overton (68.5%), steerable (26.6% and 10.4%), and distributional (10.9%) pluralism are supported by results in Figures 2-4 and Tables 1-3. The claim about patching underrepresented communities is supported by Figure 6." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "Causal claims ('Modular Pluralism improves/advances pluralistic alignment') are supported by controlled comparisons: same base LLMs evaluated with different methods (vanilla, prompting, MoE, their approach). The component analyses in Table 4 and Figure 6 provide additional causal evidence by varying specific system components." 
129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": true, 133 "justification": "The paper tests across 6 LLMs, 4 datasets, and 6 tasks, and explicitly acknowledges limitations about Western-centric community corpora, specific community definitions, and specific evaluation schemes. The Limitations section bounds scope to perspective- and culture-informed communities." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "The paper does not discuss whether improvements could be explained by alternative factors such as increased prompt length/context (community LM comments add substantial text), extra compute, or simply providing more diverse text as context. The entropy analysis (Table 6) partially addresses one confound but is insufficient." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": false, 143 "justification": "NLI entailment scores are used as a proxy for 'value coverage,' J-S distance for 'distributional alignment,' and classification accuracy for 'steerability,' but the paper does not discuss the gap between these proxies and the actual pluralism constructs they claim to measure." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": true, 150 "justification": "Appendix B specifies exact model checkpoint names: meta-llama/Llama-2-13b-hf, meta-llama/Llama-2-13b-chat-hf, davinci-002, gpt-3.5-turbo, mistralai/Mistral-7B-Instruct-v0.1, google/gemma-7b, etc." 
151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": true, 155 "justification": "Actual prompt text is provided in Section 2 ('Please comment on a given situation with the help of the following passages,' 'Which of the following comments best reflect <attribute>?') and Appendix B ('Make sure your response reflects diverse values and perspectives,' 'Please evaluate which of the two responses better reflects pluralistic values')." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Appendix B reports: block size 128, batch size 64, learning rate 1e-6, weight decay 1e-2, 1 training epoch, 16-bit precision, LoRA with default hyperparameters, greedy decoding by default, temperature τ=1 for sampling, max 512 new tokens." 161 }, 162 "scaffolding_described": { 163 "applies": false, 164 "answer": false, 165 "justification": "No agentic scaffolding is used. The multi-LLM collaboration is a straightforward pipeline (community LMs generate comments, then the main LLM processes them) without tool use, retry logic, or agent loops." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Appendix B describes dataset sampling procedures (3,132 situations from VK, 22,378 from OpinionQA, 28,763 from GlobalOpinionQA), evaluation protocols for each task, and community LM training data sources (Feng et al. 2023 perspective corpora, CultureBank)." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": true, 177 "justification": "A dedicated 'Limitations' section discusses three specific limitations: narrow community definitions, computational overhead, and evaluation scope. An 'Ethics Statement' section adds further caveats about dual-use risks." 
178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": true, 182 "justification": "Limitations are specific to this study: community LMs only cover perspective- and culture-informed communities, computational costs of running 6 additional 7B models, reliance on 4 datasets with specific evaluation schemes, and challenges of collecting community-representative corpora including intersectional communities." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": true, 187 "justification": "The paper explicitly states what was not tested: 'we mainly considered perspective-informed and culture-informed communities, while pluralistic alignment could be equally important for other definitions of community.' West-centric data bias is acknowledged, and evaluation is bounded to values, cultures, and perspectives." 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": true, 194 "justification": "The paper states 'Code and data are publicly available at https://github.com/BunsenFeng/modular_pluralism' and uses publicly available datasets (VK, OpinionQA, MoralChoice, GlobalOpinionQA)." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Community LM training corpora are described as 'six perspective-laden corpora in Feng et al. (2023) featuring left/center/right-leaning news and social media documents.' Cultural LMs use CultureBank (Shi et al. 2024) partitioned by continent. Evaluation dataset sources and sampling are described in Appendix B." 200 }, 201 "recruitment_methods_described": { 202 "applies": true, 203 "answer": false, 204 "justification": "Five human annotators are used for evaluation (Figure 3) but their recruitment method, qualifications, and selection criteria are not described." 
205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "The pipeline is documented: community corpora → LoRA fine-tuning → community LMs → comment generation → LLM processing (summarization/selection/aggregation depending on pluralism mode). Appendix B details each evaluation task's data pipeline." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": true, 216 "justification": "Acknowledgements section lists NSF CAREER Grant IIS2142739, NSF grants IIS2125201 and IIS2203097, DARPA ITM program FA8650-23-C-7316, Office of Naval Research N00014-24-1-2207, and gift funding from OpenAI." 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Author affiliations are listed: University of Washington and New York University. Authors are not affiliated with companies whose products are evaluated." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "OpenAI provides 'gift funding' while ChatGPT (gpt-3.5-turbo/davinci-002) is one of the six LLMs evaluated. OpenAI has a financial interest in positive evaluations of its products." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests or financial interests statement is present in the paper. The OpenAI funding is disclosed in acknowledgements but no formal conflict-of-interest declaration exists." 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "No training data cutoff dates are stated for any of the six LLMs used (LLaMA2, ChatGPT, LLaMA3, Gemma). These models are evaluated on public benchmarks (VK, OpinionQA, MoralChoice, GlobalOpinionQA) that could have been in their training data." 
239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": false, 243 "justification": "No discussion of whether evaluation dataset examples (survey questions from OpinionQA, GlobalOpinionQA, MoralChoice scenarios, VK situations) appeared in any model's training data." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": false, 248 "justification": "All four evaluation datasets (VK, OpinionQA, MoralChoice, GlobalOpinionQA) were publicly available before or around the models' training periods. No contamination analysis is performed." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "The paper does not conduct a human subjects study. The 5 annotators serve as evaluators of system outputs for quality assessment, not as study participants." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human subjects study is conducted. Annotators evaluate system outputs rather than being studied themselves." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human subjects study. Annotators are evaluators, not participants." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human subjects study. Annotators are used for evaluation only." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No experimental human subjects study is conducted." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No experimental human subjects study is conducted." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human subjects study. Annotators evaluate outputs." 
286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": false, 292 "justification": "A theoretical compute overhead estimate is provided ('adds only (6*7)/405=10.4% compute') but no measured inference cost, latency, API costs, or tokens consumed are reported." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "Training details are given (1 epoch, LoRA fine-tuning) but total GPU hours, training time, or total API spend are not quantified. The 20% subsample for LLaMA2-70B 'due to computing constraints' suggests significant cost but this is not quantified." 298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "No mention of multiple random seeds. All results appear to be from single runs. Community LM fine-tuning and all evaluations show single-run numbers." 305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": false, 309 "justification": "The number of experimental runs is never stated. Results are presented without indicating how many runs produced them." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": false, 314 "justification": "Fixed hyperparameters are used (learning rate 1e-6, etc.) but no search budget, search method, or justification for why these values were chosen is provided." 315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": false, 319 "justification": "The paper uses default LoRA hyperparameters and fixed prompts without explaining how these were selected or whether alternatives were tried." 320 }, 321 "multiple_comparison_correction": { 322 "applies": true, 323 "answer": false, 324 "justification": "Many comparisons are made across 6 models, 4 datasets, 6 tasks, and multiple settings, but no statistical tests are performed at all, let alone corrections for multiple comparisons." 
325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "The authors implement all baselines (vanilla, prompting, MoE) and their own method, but do not acknowledge the bias of evaluating their own system against their own baseline implementations." 330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": false, 334 "justification": "Modular Pluralism uses substantially more compute than baselines (6 extra 7B models) but performance is not reported as a function of matched compute budgets. The theoretical 10.4% overhead estimate does not substitute for actual compute-controlled comparison." 335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": false, 339 "justification": "The paper uses NLI-based value coverage as a proxy for Overton pluralism, J-S distance for distributional pluralism, and classification accuracy for steerability, but does not discuss whether these benchmarks and metrics actually measure the claimed pluralism constructs." 340 }, 341 "scaffold_confound_addressed": { 342 "applies": true, 343 "answer": true, 344 "justification": "The paper consistently compares different methods (vanilla, prompting, MoE, Modular Pluralism) using the same base LLMs, isolating the framework effect. Results are reported for 6 different LLMs, and the scaffold/framework is the controlled independent variable." 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "No discussion of temporal leakage. The evaluation datasets (VK, OpinionQA, MoralChoice, GlobalOpinionQA) were published before or around the training of the models used, but this is not addressed." 352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "No discussion of whether the evaluation setup leaks information. 
Community LM comments are provided as context to the LLM, but whether this introduces feature leakage for the specific tasks is not discussed." 357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "No discussion of whether training data for models overlaps with evaluation data. The community LMs are fine-tuned on news/social media data that may overlap with the evaluation datasets." 362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": false, 366 "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, temporal splits, or decontamination pipelines are used." 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "Modular Pluralism improves Overton pluralism value coverage by up to 50.3% over the strongest baseline.", 373 "evidence": "Figure 2 shows NLI-based value coverage: aligned ChatGPT improves from 22.8% (MoE) to 29.0% (Ours), and aligned LLaMA2-13B from 23.0% (MoE) to 29.4%. Confirmed by alternative NLI model WANLI in Table 5.", 374 "supported": "moderate" 375 }, 376 { 377 "claim": "Modular Pluralism achieves 45.8% and 16.5% higher win rates than the strongest baseline in human and GPT-4 Overton evaluation.", 378 "evidence": "Figure 3 shows win/tie/lose rates against MoE, prompting, and vanilla baselines with ChatGPT. Against prompting (strongest baseline), win rates are 62.9% (human) and 46.0% (GPT-4).", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "Modular Pluralism improves steerability by up to 23.8% on balanced accuracy for value classification.", 383 "evidence": "Table 1 shows three-way Value Kaleidoscope results: aligned ChatGPT achieves 68.7% BAcc vs 55.5% for aligned prompting, a 23.8% improvement. 
Results consistent across binary/three-way settings and both models.", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "Modular Pluralism reduces distributional distance to human populations by at least 10.9% on GlobalOpinionQA.", 388 "evidence": "Table 3 shows J-S distance reductions across 7 countries. Unaligned LLaMA2-13B: 0.295 (MoE) to 0.274 (Ours), a 7.1% reduction. Unaligned ChatGPT: 0.335 (prompting) to 0.274 (Ours), an 18.2% reduction. Average reduction 14.9%.", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "LLMs are generally faithful to community LM inputs with 51.2% average coverage rate.", 393 "evidence": "Figure 5 shows NLI-based entailment analysis across 5 LLMs and 6 community LMs. Social media community LMs have higher coverage (57.7%) than news (44.7%). LLMs also add 33.8% new content.", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "Adding new community LMs can patch underrepresented communities, improving J-S distance by 5.2-6.7%.", 398 "evidence": "Figure 6 shows adding an Asian culture community LM improves Japan alignment (from ~0.30 to ~0.285) and an African culture LM improves Nigeria/Kenya alignment, without degrading other communities.", 399 "supported": "weak" 400 } 401 ], 402 "red_flags": [ 403 { 404 "flag": "No error bars or statistical tests", 405 "detail": "All results across 6 tasks, 6 models, and 4 datasets are reported as point estimates from apparent single runs. No confidence intervals, significance tests, or variance measures are provided despite many comparative claims." 406 }, 407 { 408 "flag": "Non-independent funder", 409 "detail": "OpenAI provides gift funding while ChatGPT is one of the primary evaluation models. The conflict is not explicitly acknowledged beyond listing OpenAI in acknowledgements." 410 }, 411 { 412 "flag": "Small human evaluation sample", 413 "detail": "Human evaluation uses only 5 annotators on 100 response pairs with moderate agreement (Fleiss' Kappa 0.4678). 
No annotator demographics or recruitment details are provided." 414 }, 415 { 416 "flag": "No contamination analysis", 417 "detail": "All evaluation datasets were publicly available around or before the models' training periods, yet no contamination analysis is performed. This could differentially affect baselines and the proposed method." 418 }, 419 { 420 "flag": "Compute confound unaddressed", 421 "detail": "Modular Pluralism uses 6 additional 7B community LMs at inference time, providing substantially more context to the main LLM. The improvement could partially be attributed to longer/richer context rather than the specific community specialization. No ablation separates these effects." 422 } 423 ], 424 "cited_papers": [ 425 { 426 "title": "Constitutional AI: Harmlessness from AI Feedback", 427 "authors": ["Yuntao Bai", "Saurav Kadavath", "Sandipan Kundu"], 428 "year": 2022, 429 "arxiv_id": "2212.08073", 430 "relevance": "Foundational alignment approach using AI feedback, directly relevant to alignment methodology evaluation." 431 }, 432 { 433 "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model", 434 "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"], 435 "year": 2024, 436 "relevance": "Major alignment method (DPO) that this paper builds upon and contrasts with for pluralistic alignment." 437 }, 438 { 439 "title": "Training Language Models to Follow Instructions with Human Feedback", 440 "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"], 441 "year": 2022, 442 "relevance": "InstructGPT/RLHF foundational paper for LLM alignment with human preferences." 
443 }, 444 { 445 "title": "MaxMin-RLHF: Towards Equitable Alignment of Large Language Models with Diverse Human Preferences", 446 "authors": ["Souradip Chakraborty", "Jiahao Qiu", "Hui Yuan"], 447 "year": 2024, 448 "arxiv_id": "2402.08925", 449 "relevance": "Directly addresses equitable alignment with diverse preferences, a key comparison point for pluralistic alignment methods." 450 }, 451 { 452 "title": "A Roadmap to Pluralistic Alignment", 453 "authors": ["Taylor Sorensen", "Jared Moore", "Jillian Fisher"], 454 "year": 2024, 455 "arxiv_id": "2402.05070", 456 "relevance": "Defines the three pluralism objectives (Overton, steerable, distributional) that this paper operationalizes and evaluates." 457 }, 458 { 459 "title": "Knowledge Card: Filling LLMs' Knowledge Gaps with Plug-in Specialized Language Models", 460 "authors": ["Shangbin Feng", "Weijia Shi", "Yuyang Bai"], 461 "year": 2024, 462 "relevance": "Prior multi-LLM collaboration framework that Modular Pluralism builds upon for decoding-time collaboration." 463 }, 464 { 465 "title": "DExperts: Decoding-time Controlled Text Generation with Experts and Anti-experts", 466 "authors": ["Alisa Liu", "Maarten Sap", "Ximing Lu"], 467 "year": 2021, 468 "relevance": "Decoding-time control method relevant to understanding controlled generation approaches for LLM alignment." 469 }, 470 { 471 "title": "Whose Opinions Do Language Models Reflect?", 472 "authors": ["Shibani Santurkar", "Esin Durmus", "Faisal Ladhak"], 473 "year": 2023, 474 "relevance": "Demonstrates that LLMs reflect narrow demographic opinions, motivating pluralistic alignment and providing the OpinionQA dataset." 475 }, 476 { 477 "title": "Evaluating the Moral Beliefs Encoded in LLMs", 478 "authors": ["Nino Scherrer", "Claudia Shi", "Amir Feder"], 479 "year": 2024, 480 "relevance": "Provides the MoralChoice dataset used for distributional pluralism evaluation, assessing moral reasoning in LLMs." 
481 }, 482 { 483 "title": "Towards Measuring the Representation of Subjective Global Opinions in Language Models", 484 "authors": ["Esin Durmus", "Karina Nguyen", "Thomas I Liao"], 485 "year": 2023, 486 "arxiv_id": "2306.16388", 487 "relevance": "Provides the GlobalOpinionQA dataset and demonstrates LLM alignment gaps with global opinion distributions." 488 }, 489 { 490 "title": "Personalized Soups: Personalized Large Language Model Alignment via Post-hoc Parameter Merging", 491 "authors": ["Joel Jang", "Seungone Kim", "Bill Yuchen Lin"], 492 "year": 2023, 493 "arxiv_id": "2310.11564", 494 "relevance": "Alternative approach to personalized/pluralistic alignment through parameter merging, requiring white-box access unlike Modular Pluralism." 495 }, 496 { 497 "title": "Proximal Policy Optimization Algorithms", 498 "authors": ["John Schulman", "Filip Wolski", "Prafulla Dhariwal"], 499 "year": 2017, 500 "arxiv_id": "1707.06347", 501 "relevance": "Core RL algorithm used in RLHF alignment pipelines that this paper's approach provides an alternative to." 502 } 503 ], 504 "engagement_factors": { 505 "practical_relevance": { 506 "score": 2, 507 "justification": "Framework could be adopted by practitioners deploying LLMs to diverse user populations, though requires training community LMs as a prerequisite." 508 }, 509 "surprise_contrarian": { 510 "score": 1, 511 "justification": "Builds on known issues with alignment monoculture; the idea that smaller specialized models can complement large LLMs is incremental rather than surprising." 512 }, 513 "fear_safety": { 514 "score": 1, 515 "justification": "Ethics statement mentions dual-use risk of representing hateful communities, but this is not the paper's focus." 516 }, 517 "drama_conflict": { 518 "score": 0, 519 "justification": "No controversy, no challenge to any specific company or widely-held belief." 
520 }, 521 "demo_ability": { 522 "score": 2, 523 "justification": "Code and data released at GitHub (https://github.com/BunsenFeng/modular_pluralism), though running the full system requires multiple model fine-tuning steps." 524 }, 525 "brand_recognition": { 526 "score": 1, 527 "justification": "University of Washington and NYU affiliations with known researchers (Yejin Choi, Yulia Tsvetkov), but not a major industry lab." 528 } 529 } 530 }