scan.json (31291B)
1 { 2 "paper": { 3 "title": "Exploring Persona-dependent LLM Alignment for the Moral Machine Experiment", 4 "authors": [ 5 "Jiseon Kim", 6 "Jea Kwon", 7 "Luiz Felipe Vecchietti", 8 "Alice Oh", 9 "Meeyoung Cha" 10 ], 11 "year": 2025, 12 "venue": "ICLR 2025", 13 "arxiv_id": "2504.10886", 14 "doi": "10.48550/arXiv.2504.10886" 15 }, 16 "scan_version": 3, 17 "active_modules": ["experimental_rigor", "data_leakage"], 18 "methodology_tags": ["benchmark-eval", "observational"], 19 "key_findings": "LLM moral decisions vary substantially by assigned persona, with political orientation producing the largest shifts — GPT-4o shows MDD=0.48 for conservative vs. progressive personas, far exceeding human variation (MDD=0.12). GPT-4o aligns best with human baselines overall, while GPT-3.5 and Llama2 exhibit ~20% of moral decisions misaligned with human responses. Llama2 shows extremely low valid response rates for certain personas (conservative: 7.2%, religious: 6.0%), raising questions about the reliability of its moral decision analysis under those conditions.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": false, 30 "justification": "The human baseline data from Awad et al. (2018) is public, but the authors' own experimental data — 10,000 generated scenarios and LLM responses across 15 persona conditions × 3 models — is not released." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper mentions using Azure OpenAI API and HuggingFace models but provides no requirements.txt, Dockerfile, or environment specification beyond API/model names." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions are provided. 
The methodology is described at a conceptual level but lacks concrete runnable instructions." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "All results (AMCE values, MDD distances, alignment scores in Table 2) are reported as point estimates with no confidence intervals or error bars." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "No statistical significance tests are applied to any comparison. Differences between personas, between models, and between LLM and human responses are compared solely by point estimates." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "AMCE (Average Marginal Component Effect) is itself an effect size measure from causal inference. MDD values are reported with specific magnitudes (e.g., MDD=0.48 for GPT-4o political persona). Table 2 gives Euclidean distances. These provide magnitude context." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "The paper uses 10,000 generated scenarios without justifying why this number was chosen. No power analysis is discussed." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "Results are single-run point estimates. With temperature=1 for GPT models and temperature=0.4 for Llama2, responses are stochastic, yet no variance, standard deviation, or spread across runs is reported." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Two types of baselines are included: (1) a no-persona baseline for each LLM, and (2) human response baselines from Awad et al. (2018) for each demographic subgroup. These are shown in Fig. 2 (gray-shaded radar plots) and Table 2." 
75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "The study includes GPT-4o (2024), a contemporary model. Prior work comparisons reference Takemoto (2024) and Ahmad & Takemoto (2024), both recent. The human baselines from Awad et al. (2018) are the canonical dataset for this framework." 80 }, 81 "ablation_study": { 82 "applies": false, 83 "answer": false, 84 "justification": "There is no multi-component system to ablate. The study examines the effect of persona prompts on LLM responses — the persona IS the experimental variable, not a system component." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Multiple metrics are used: AMCE values (per-scenario effect sizes), MDD (Moral Decision Distance, Euclidean distance between persona AMCE vectors), Euclidean alignment scores (Table 2), and valid response rates (Table 4)." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": false, 94 "justification": "No human evaluation of LLM outputs is performed. The comparison to human data uses pre-existing survey responses from Awad et al. (2018), not human judgments of LLM moral reasoning quality." 95 }, 96 "held_out_test_set": { 97 "applies": false, 98 "answer": false, 99 "justification": "No training or tuning is performed in this study. The LLMs are prompted in a zero-shot manner, so the train/test distinction is not applicable." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Extensive breakdowns are provided: by persona category (age, gender, culture, political, education, religion, income) in Figs. 2-4, by moral scenario dimension (9 dimensions) in Figs. 4 and 6, and per-model in Tables 2 and 4." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section 4.4 discusses 'moral flips' where LLM preferences reverse from human baselines. Fig. 
5 quantifies misalignment percentages. Specific dimensions where preferences flip below 0 are identified (e.g., Social Status under progressive persona for GPT-4o)." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Llama2's extremely low valid response rates for certain personas (conservative: 7.2%, religious: 6.0% in Table 4) are reported. The ~20% misalignment rate for GPT-3.5 and Llama2 is presented as a concerning finding. Llama2's high MDD values across all categories are highlighted." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The abstract claims (1) moral decisions vary by persona — supported by Figs. 2, 4, 6; (2) greater shifts than humans — supported by Fig. 3 MDD comparisons; (3) partisan sorting phenomenon — supported by political MDD values in Section 4.3. All main claims have corresponding data." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "The causal claim is that assigning personas shifts LLM moral decisions. The study design supports this: persona assignment is a controlled manipulation while scenarios are held constant, allowing attribution of decision shifts to the persona condition. This is essentially a within-model experimental design." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The Discussion section extrapolates to 'autonomous vehicles or ethical support systems' and 'real-world applications' based on 3 specific LLMs tested on one moral dilemma framework (trolley problems). The title appropriately scopes to 'the Moral Machine Experiment' but the discussion extends well beyond the tested setting without bounding these extensions." 
132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper interprets results primarily through partisan sorting theory and briefly mentions political sycophancy, but does not consider alternative explanations such as prompt sensitivity artifacts, training data composition effects, RLHF biases, or whether the observed shifts reflect genuine persona modeling versus shallow keyword matching." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper measures LLM text responses to hypothetical trolley problem prompts and frames these as 'moral decisions' with implications for real-world deployment. The gap between responses to simplified moral dilemmas and actual moral behavior in complex real-world scenarios is never acknowledged." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": true, 148 "justification": "Exact model versions are specified in Section 4.1: 'gpt-4o-2024-05-13', 'gpt-3.5-turbo-0613, released in June 2023', and 'Llama-2-7b-chat-hf'." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": false, 153 "justification": "The persona template is provided ('You {persona}. Your responses should closely mirror the knowledge and abilities of this persona.') with fill values in Table 1. However, the actual moral dilemma scenario text sent to the models is not provided — only the generation methodology from Takemoto (2024) is referenced. The full prompts cannot be reconstructed from the paper." 
154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section 4.1 reports: GPT models use 'temperature set to 1 and top p set to 1' (Azure OpenAI API defaults); Llama2 uses 'top k equal to 10, top p equal to 0.9, a max length of 512, and temperature set to 0.4.'" 159 }, 160 "scaffolding_described": { 161 "applies": false, 162 "answer": false, 163 "justification": "No agentic scaffolding is used. The study involves direct prompting of LLMs with persona instructions and scenario descriptions." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": true, 168 "justification": "The pipeline is documented: 10,000 scenarios generated via constrained randomization across 9 categories (Section 4.1), valid response filtering described with rates in Table 4, AMCE computation methodology explained in Section 3.2, and culture AMCE weighted-average formula provided." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": true, 175 "justification": "The Discussion section (Section 5) contains substantive limitation subsections: 'Improving Persona Settings for LLMs' discusses binary persona simplification and single-prompt methodology limitations, and 'Expanding Moral Machine Scenarios' discusses the narrow scope of the moral dilemma framework." 
176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": true, 180 "justification": "Specific threats discussed include: the use of binary personas, which 'simplifies the complexity of human moral decision-making and the impact of these models on minority groups,' the limitation of relying on a single prompt template, and the narrow scope of trolley-problem moral dilemmas, which 'represents a narrow subset of ethical issues.'" 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": true, 185 "justification": "Section 5 states the study 'primarily used simplified personas' and that the moral machine framework 'predominantly scrutinizes ethical choices related to autonomous vehicles.' The Conclusion acknowledges the need for 'multidimensional constructs' and 'a broader range of contexts.'" 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": false, 192 "justification": "Neither the 10,000 generated scenarios nor the LLM response data are released. The human baseline data from Awad et al. (2018) is public, but the authors' own experimental outputs are not available for verification." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "Section 4.1 describes generating 10,000 scenarios via constrained randomization following Takemoto (2024), prompting 3 models with 14 personas + baseline using specified API settings, and the valid response filtering process (Table 4)." 198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants were recruited for this study. The human baseline data comes from the public Moral Machine experiment (Awad et al., 2018), a standard benchmark dataset."
203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": true, 207 "justification": "The pipeline is traceable: scenario generation (10,000 via constrained randomization) → persona assignment (14 personas + baseline) → LLM response collection (3 models) → valid response filtering (rates in Table 4) → AMCE computation → MDD calculation. Culture AMCE weighted-average formula is provided." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": true, 214 "justification": "Section 7 (Acknowledgements) states: 'supported by Institute of Information & communications Technology Planning & Evaluation (IITP) grant funded by the Korea government (MSIT) (No.RS-2022-II220184).'" 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Author affiliations are clearly listed: KAIST (Korea) and Max Planck Institute for Security and Privacy (Germany). No authors are affiliated with OpenAI or Meta, whose models are evaluated." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": true, 224 "justification": "The Korean government (MSIT/IITP) is an independent research funder with no commercial stake in whether GPT-4o, GPT-3.5, or Llama2 aligns with human moral judgments." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests or financial interests statement is present in the paper." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": true, 235 "answer": false, 236 "justification": "No training data cutoff dates are stated for any of the three models. The paper specifies model versions but not when their training data was collected." 
237 }, 238 "train_test_overlap_discussed": { 239 "applies": true, 240 "answer": false, 241 "justification": "No discussion of whether the Moral Machine experiment data (published in Nature, 2018) — including its scenarios, AMCE values, and survey results — appeared in the training data of GPT-4o, GPT-3.5, or Llama2." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": true, 245 "answer": false, 246 "justification": "The Moral Machine experiment is a high-profile 2018 Nature publication. All three models were trained well after 2018 and likely encountered this data. The scenario generation follows Takemoto (2024), but whether the moral dilemma framework itself is contaminated in the models is never discussed." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "This paper does not conduct a human subjects study. It uses pre-existing human data from the public Moral Machine dataset (Awad et al., 2018) as baselines." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants were recruited. The study queries LLMs and compares to pre-existing public survey data." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants were recruited for this study. The demographics of the original Moral Machine participants are described in Appendix A.2 as context for the baseline data." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants were recruited. The study uses pre-existing public data." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants were recruited or assigned to conditions." 
274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants were involved in this study." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants were recruited for this study." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": false, 290 "justification": "The study queries 3 LLMs across 10,000 scenarios × 15 persona conditions (150,000+ API calls per model). No cost, token usage, or latency information is reported." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "No mention of total computational budget, GPU hours for Llama2 inference, API costs for GPT models, or wall-clock time for the experiments." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "Temperature=1 for GPT models introduces substantial stochasticity in responses. No multiple-seed or multiple-run analysis is reported." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": false, 307 "justification": "The paper does not state how many times each scenario was run per model-persona combination. Results appear to be from a single run." 308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": false, 312 "justification": "Hyperparameters are adopted from Takemoto (2024) for Llama2 and defaults for GPT models. No search budget is reported, and no justification is given for why these specific settings are optimal for this study." 
313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": true, 317 "justification": "Model selection is justified in Section 4.1: 'We select three top-performing models from their LLM series based on their strong alignment with human responses reported in Takemoto (2024), and their high valid response rate.'" 318 }, 319 "multiple_comparison_correction": { 320 "applies": true, 321 "answer": false, 322 "justification": "The study makes many comparisons across 7 persona categories × 2 groups × 9 moral dimensions × 3 models. No multiple comparison correction is applied — in fact, no statistical tests are performed at all." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": false, 326 "answer": false, 327 "justification": "The authors do not propose a new system. They evaluate existing public models (GPT-4o, GPT-3.5, Llama2) against existing public human data, so self-comparison bias does not apply." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": false, 331 "answer": false, 332 "justification": "The study is not comparing competing methods at different compute budgets. It examines behavioral patterns across persona conditions." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": true, 337 "justification": "Section 5 ('Expanding Moral Machine Scenarios') explicitly discusses construct validity: the Moral Machine 'predominantly scrutinizes ethical choices related to autonomous vehicles, which represents a narrow subset of ethical issues' and calls for expanding to 'other types of moral decisions, such as allocation of healthcare resources, judicial decisions, or climate change policies.'" 338 }, 339 "scaffold_confound_addressed": { 340 "applies": false, 341 "answer": false, 342 "justification": "No scaffolding is used. Models are directly prompted with persona instructions and scenario descriptions." 
343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": false, 349 "justification": "The Moral Machine experiment (2018) and all its data predate the training of all three models. Whether models have memorized patterns from this widely-cited Nature paper is not discussed." 350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "No discussion of whether the structured moral dilemma scenarios leak information about expected responses. The moral machine framework's binary choice structure may cue socially desirable responses." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "No discussion of whether the 10,000 generated scenarios share structural dependencies (constrained randomization could produce correlated scenarios) or whether model responses across scenarios are independent." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": false, 364 "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference tests, or contamination analysis." 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "LLM moral decisions vary substantially by persona, with greater shifts than humans even in critical dilemmas.", 371 "evidence": "Fig. 3 shows LLM MDD values consistently exceed human MDD values across all persona categories. GPT-4o MDD ranges from 0.08 (gender) to 0.48 (political) vs. human range of 0.06-0.33. Fig. 4 shows LLM preferences cross zero in multiple dimensions while human preferences never do.", 372 "supported": "strong" 373 }, 374 { 375 "claim": "Political persona produces the largest shift in LLM moral decisions — a partisan sorting effect.", 376 "evidence": "Fig. 3 and Section 4.3: GPT-4o political MDD=0.48 (vs. next highest age=0.13), GPT-3.5 political MDD=0.81, Llama2 political MDD=0.53. 
Human political MDD=0.12, among the lowest for humans. This inversion is the core finding.", 377 "supported": "strong" 378 }, 379 { 380 "claim": "GPT-4o demonstrates the strongest alignment with human moral responses overall.", 381 "evidence": "Table 2 shows GPT-4o achieves the lowest Euclidean distance (best alignment) in 7/14 persona conditions. GPT-3.5 also achieves 7/14 but with a higher baseline distance. Llama2 achieves 0/14.", 382 "supported": "strong" 383 }, 384 { 385 "claim": "Nearly one-fifth of all LLM decisions for GPT-3.5 and Llama2 are misaligned with human responses.", 386 "evidence": "Fig. 5 shows approximately 20% moral decision misalignment for GPT-3.5 and Llama2, while GPT-4o shows a lower percentage. However, exact percentages and uncertainty bounds are not provided in the text.", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "Persona-dependent prompts improve GPT-3.5 alignment with corresponding human demographics in 13/14 cases.", 391 "evidence": "Section 4.2 states this claim directly, comparing Table 2 persona columns to the baseline column. Most persona distances for GPT-3.5 are lower than the baseline 1.015.", 392 "supported": "moderate" 393 } 394 ], 395 "red_flags": [ 396 { 397 "flag": "No statistical significance testing", 398 "detail": "All comparisons across personas, models, and scenarios are based on point estimates (AMCE, MDD, Euclidean distance). No confidence intervals, significance tests, or uncertainty quantification is provided for any result. With stochastic LLM outputs (temperature=1 for GPT models), observed differences could reflect sampling noise." 399 }, 400 { 401 "flag": "Single-run results with stochastic models", 402 "detail": "Temperature=1 for GPT models means substantial response variability, yet results appear to be from a single run. Without multiple runs and variance reporting, the stability of reported MDD values and AMCE patterns is unknown." 
403 }, 404 { 405 "flag": "Unreliable Llama2 results due to very low valid response rates", 406 "detail": "Table 4 shows Llama2 valid response rates as low as 6.0% (religious persona), 7.2% (conservative), 11.4% (male), and 17.1% (female). Computing AMCE values from such small fractions of responses (under 10% for the religious and conservative personas) produces unreliable estimates, yet these are treated equally with high-response-rate conditions in figures and analysis." 407 }, 408 { 409 "flag": "No contamination analysis for widely-known benchmark", 410 "detail": "The Moral Machine experiment was published in Nature (2018) with massive public engagement. All three models were trained after 2018 and likely encountered this data, including the AMCE results and moral dilemma framework. The paper never discusses whether observed LLM behavior reflects genuine moral reasoning vs. memorized patterns from training data." 411 }, 412 { 413 "flag": "Narrow model sample for broad claims", 414 "detail": "Only 3 models are tested (2 from the same GPT family, 1 small open-source model). Claims about 'LLMs' broadly and implications for 'deploying these models' extend well beyond this limited sample, which does not include other major model families (Claude, Gemini, Mistral, etc.)." 415 } 416 ], 417 "cited_papers": [ 418 { 419 "title": "Large-scale moral machine experiment on large language models", 420 "authors": ["Muhammad Shahrul Zaim bin Ahmad", "Kazuhiro Takemoto"], 421 "year": 2024, 422 "arxiv_id": "2411.06790", 423 "relevance": "Directly relevant — evaluates LLM alignment with human moral judgments at scale across commercial and open-source models." 424 }, 425 { 426 "title": "The moral machine experiment on large language models", 427 "authors": ["Kazuhiro Takemoto"], 428 "year": 2024, 429 "relevance": "Foundation work for this paper — tests LLM behavior on Moral Machine scenarios and provides the experimental methodology adopted here."
430 }, 431 { 432 "title": "Language model alignment in multilingual trolley problems", 433 "authors": ["Zhijing Jin", "Max Kleiman-Weiner", "Giorgio Piatti"], 434 "year": 2024, 435 "arxiv_id": "2407.02273", 436 "relevance": "Studies LLM cultural bias through multilingual moral dilemma scenarios, closely related to the persona-cultural alignment examined here." 437 }, 438 { 439 "title": "Bias runs deep: Implicit reasoning biases in persona-assigned LLMs", 440 "authors": ["Shashank Gupta", "Vaishnavi Shrivastava", "Ameet Deshpande"], 441 "year": 2024, 442 "relevance": "Demonstrates how persona assignment introduces reasoning biases in LLMs — directly informs the persona template methodology used in this paper." 443 }, 444 { 445 "title": "Moral foundations of large language models", 446 "authors": ["Marwa Abdulhai", "Gregory Serapio-Garcia", "Clément Crepy"], 447 "year": 2023, 448 "arxiv_id": "2310.15337", 449 "relevance": "Explores political bias in LLMs through moral foundations theory, finding authority emphasis among conservative personas — directly corroborated by this paper's findings." 450 }, 451 { 452 "title": "Moral mimicry: Large language models produce moral rationalizations tailored to political identity", 453 "authors": ["Gabriel Simmons"], 454 "year": 2022, 455 "arxiv_id": "2209.12106", 456 "relevance": "Demonstrates LLMs generate politically-biased text when assigned political identities, directly relevant to the partisan sorting phenomenon found here." 457 }, 458 { 459 "title": "GPT-4 technical report", 460 "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"], 461 "year": 2023, 462 "arxiv_id": "2303.08774", 463 "relevance": "Technical report for GPT-4, the model family whose alignment properties are a central subject of evaluation in this paper." 
464 }, 465 { 466 "title": "Llama 2: Open foundation and fine-tuned chat models", 467 "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"], 468 "year": 2023, 469 "arxiv_id": "2307.09288", 470 "relevance": "Describes the Llama 2 model family and its RLHF training methodology, relevant to understanding why Llama2 shows different alignment patterns." 471 }, 472 { 473 "title": "GPT-4o system card", 474 "authors": ["Aaron Hurst", "Adam Lerer", "Adam P Goucher"], 475 "year": 2024, 476 "arxiv_id": "2410.21276", 477 "relevance": "System card for GPT-4o detailing its safety properties and alignment training, directly relevant to the model's moral decision behavior evaluated here." 478 }, 479 { 480 "title": "From persona to personalization: A survey on role-playing language agents", 481 "authors": ["Jiangjie Chen", "Xintao Wang", "Rui Xu"], 482 "year": 2024, 483 "arxiv_id": "2404.18231", 484 "relevance": "Comprehensive survey of persona modeling in LLMs, providing context for how persona prompting affects LLM behavior." 485 }, 486 { 487 "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning", 488 "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"], 489 "year": 2025, 490 "arxiv_id": "2501.12948", 491 "relevance": "Describes RL-based reasoning training for LLMs, relevant to understanding how training methodology affects model alignment behavior." 492 } 493 ], 494 "engagement_factors": { 495 "practical_relevance": { 496 "score": 1, 497 "justification": "Provides insights for anyone deploying LLMs in persona-sensitive contexts, but offers no tool, technique, or concrete mitigation strategy." 498 }, 499 "surprise_contrarian": { 500 "score": 2, 501 "justification": "The finding that LLMs are far more sensitive to political personas than humans — inverting the human pattern where politics barely affects moral decisions — is surprising and counterintuitive." 
502 }, 503 "fear_safety": { 504 "score": 1, 505 "justification": "Raises concerns about persona-driven bias in morally sensitive applications, but does not demonstrate a novel attack or existential risk." 506 }, 507 "drama_conflict": { 508 "score": 1, 509 "justification": "The political sycophancy finding has some controversy potential, but the paper is measured in tone and focused on an academic framework." 510 }, 511 "demo_ability": { 512 "score": 0, 513 "justification": "No code, demo, or reproducible artifact is released." 514 }, 515 "brand_recognition": { 516 "score": 2, 517 "justification": "Evaluates GPT-4o and Llama2 — well-known models — and is published at ICLR 2025, a top venue." 518 } 519 } 520 }