scan-v4.json (35301B)
1 { 2 "scan_version": 4, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Exploring Persona-dependent LLM Alignment for the Moral Machine Experiment", 6 "authors": [ 7 "Jiseon Kim", 8 "Jea Kwon", 9 "Luiz Felipe Vecchietti", 10 "Alice Oh", 11 "Meeyoung Cha" 12 ], 13 "year": 2025, 14 "venue": "ICLR", 15 "arxiv_id": "2504.10886", 16 "doi": "10.48550/arXiv.2504.10886" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "The abstract claims (1) moral decisions vary by persona — supported by Figs. 2, 4, 6; (2) greater shifts than humans — supported by Fig. 3 MDD comparisons; (3) partisan sorting phenomenon — supported by political MDD values in Section 4.3. All main claims have corresponding data.", 24 "source": "opus" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "The causal claim is that assigning personas shifts LLM moral decisions. The study design supports this: persona assignment is a controlled manipulation while scenarios are held constant, allowing attribution of decision shifts to the persona condition. This is essentially a within-model experimental design.", 30 "source": "opus" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "The Discussion section extrapolates to 'autonomous vehicles or ethical support systems' and 'real-world applications' based on 3 specific LLMs tested on one moral dilemma framework (trolley problems). 
The title appropriately scopes to 'the Moral Machine Experiment' but the discussion extends well beyond the tested setting without bounding these extensions.", 36 "source": "opus" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper interprets results primarily through partisan sorting theory and briefly mentions political sycophancy, but does not consider alternative explanations such as prompt sensitivity artifacts, training data composition effects, RLHF biases, or whether the observed shifts reflect genuine persona modeling versus shallow keyword matching.", 42 "source": "opus" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": false, 47 "justification": "The paper measures LLM text responses to hypothetical trolley problem prompts and frames these as 'moral decisions' with implications for real-world deployment. The gap between responses to simplified moral dilemmas and actual moral behavior in complex real-world scenarios is never acknowledged.", 48 "source": "opus" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "The Discussion section (Section 5) contains substantive limitation subsections: 'Improving Persona Settings for LLMs' discusses binary persona simplification and single-prompt methodology limitations, and 'Expanding Moral Machine Scenarios' discusses the narrow scope of the moral dilemma framework.", 56 "source": "opus" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": true, 61 "justification": "Specific threats discussed include: binary personas 'simplifies the complexity of human moral decision-making and the impact of these models on minority groups,' the single prompt template methodology limitation, and the narrow scope of trolley-problem moral dilemmas which 'represents a narrow subset of ethical issues.'", 62 "source": "opus" 63 }, 64 
"scope_boundaries_stated": { 65 "applies": true, 66 "answer": true, 67 "justification": "Section 5 states the study 'primarily used simplified personas' and that the moral machine framework 'predominantly scrutinizes ethical choices related to autonomous vehicles.' The Conclusion acknowledges the need for 'multidimensional constructs' and 'a broader range of contexts.'", 68 "source": "opus" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": true, 75 "justification": "Section 7 (Acknowledgements) states: 'supported by Institute of Information & communications Technology Planning & Evaluation (IITP) grant funded by the Korea government (MSIT) (No.RS-2022-II220184).'", 76 "source": "opus" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations are clearly listed: KAIST (Korea) and Max Planck Institute for Security and Privacy (Germany). No authors are affiliated with OpenAI or Meta, whose models are evaluated.", 82 "source": "opus" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": true, 87 "justification": "The Korean government (MSIT/IITP) is an independent research funder with no commercial stake in whether GPT-4o, GPT-3.5, or Llama2 aligns with human moral judgments.", 88 "source": "opus" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests or financial interests statement is present in the paper.", 94 "source": "opus" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Key terms defined: 'Moral Machine Experiment' (referenced to Awad et al. 
2018), AMCE explained with range and interpretation, 'alignment' operationalized as distance between LLM and human AMCE vectors.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Three contributions explicitly stated: (1) evidence of persona influence on LLM decisions, (2) MDD metric for measuring alignment gaps, (3) ethical risks and partisan sorting discussion. Reader knows what paper adds.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Related work section (2) engages substantively: shows prior work examined aggregate-level patterns (Takemoto, Ahmad), language effects (Jin), but 'lack of research exploring how context-dependent factors...influence these LLM decisions.'", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": false, 124 "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper.", 125 "source": "opus" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": false, 130 "justification": "The human baseline data from Awad et al. (2018) is public, but the authors' own experimental data — 10,000 generated scenarios and LLM responses across 15 persona conditions × 3 models — is not released.", 131 "source": "opus" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper mentions using Azure OpenAI API and HuggingFace models but provides no requirements.txt, Dockerfile, or environment specification beyond API/model names.", 137 "source": "opus" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "No step-by-step reproduction instructions are provided. 
The methodology is described at a conceptual level but lacks concrete runnable instructions.", 143 "source": "opus" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "All results (AMCE values, MDD distances, alignment scores in Table 2) are reported as point estimates with no confidence intervals or error bars.", 151 "source": "opus" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are applied to any comparison. Differences between personas, between models, and between LLM and human responses are compared solely by point estimates.", 157 "source": "opus" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "AMCE (Average Marginal Component Effect) is itself an effect size measure from causal inference. MDD values are reported with specific magnitudes (e.g., MDD=0.48 for GPT-4o political persona). Table 2 gives Euclidean distances. These provide magnitude context.", 163 "source": "opus" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "The paper uses 10,000 generated scenarios without justifying why this number was chosen. No power analysis is discussed.", 169 "source": "opus" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "Results are single-run point estimates. With temperature=1 for GPT models and temperature=0.4 for Llama2, responses are stochastic, yet no variance, standard deviation, or spread across runs is reported.", 175 "source": "opus" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Two types of baselines are included: (1) a no-persona baseline for each LLM, and (2) human response baselines from Awad et al. (2018) for each demographic subgroup. 
These are shown in Fig. 2 (gray-shaded radar plots) and Table 2.", 183 "source": "opus" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "The study includes GPT-4o (2024), a contemporary model. Prior work comparisons reference Takemoto (2024) and Ahmad & Takemoto (2024), both recent. The human baselines from Awad et al. (2018) are the canonical dataset for this framework.", 189 "source": "opus" 190 }, 191 "ablation_study": { 192 "applies": false, 193 "answer": false, 194 "justification": "There is no multi-component system to ablate. The study examines the effect of persona prompts on LLM responses — the persona IS the experimental variable, not a system component.", 195 "source": "opus" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Multiple metrics are used: AMCE values (per-scenario effect sizes), MDD (Moral Decision Distance, Euclidean distance between persona AMCE vectors), Euclidean alignment scores (Table 2), and valid response rates (Table 4).", 201 "source": "opus" 202 }, 203 "human_evaluation": { 204 "applies": true, 205 "answer": false, 206 "justification": "No human evaluation of LLM outputs is performed. The comparison to human data uses pre-existing survey responses from Awad et al. (2018), not human judgments of LLM moral reasoning quality.", 207 "source": "opus" 208 }, 209 "held_out_test_set": { 210 "applies": false, 211 "answer": false, 212 "justification": "No training or tuning is performed in this study. The LLMs are prompted in a zero-shot manner, so the train/test distinction is not applicable.", 213 "source": "opus" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Extensive breakdowns are provided: by persona category (age, gender, culture, political, education, religion, income) in Figs. 2-4, by moral scenario dimension (9 dimensions) in Figs. 
4 and 6, and per-model in Tables 2 and 4.", 219 "source": "opus" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Section 4.4 discusses 'moral flips' where LLM preferences reverse from human baselines. Fig. 5 quantifies misalignment percentages. Specific dimensions where preferences flip below 0 are identified (e.g., Social Status under progressive persona for GPT-4o).", 225 "source": "opus" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "Llama2's extremely low valid response rates for certain personas (conservative: 7.2%, religious: 6.0% in Table 4) are reported. The ~20% misalignment rate for GPT-3.5 and Llama2 is presented as a concerning finding. Llama2's high MDD values across all categories are highlighted.", 231 "source": "opus" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 238 "justification": "Exact model versions are specified in Section 4.1: 'gpt-4o-2024-05-13', 'gpt-3.5-turbo-0613, released in June 2023', and 'Llama-2-7b-chat-hf'.", 239 "source": "opus" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": false, 244 "justification": "The persona template is provided ('You {persona}. Your responses should closely mirror the knowledge and abilities of this persona.') with fill values in Table 1. However, the actual moral dilemma scenario text sent to the models is not provided — only the generation methodology from Takemoto (2024) is referenced. 
The full prompts cannot be reconstructed from the paper.", 245 "source": "opus" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Section 4.1 reports: GPT models use 'temperature set to 1 and top p set to 1' (Azure OpenAI API defaults); Llama2 uses 'top k equal to 10, top p equal to 0.9, a max length of 512, and temperature set to 0.4.'", 251 "source": "opus" 252 }, 253 "scaffolding_described": { 254 "applies": false, 255 "answer": false, 256 "justification": "No agentic scaffolding is used. The study involves direct prompting of LLMs with persona instructions and scenario descriptions.", 257 "source": "opus" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "The pipeline is documented: 10,000 scenarios generated via constrained randomization across 9 categories (Section 4.1), valid response filtering described with rates in Table 4, AMCE computation methodology explained in Section 3.2, and culture AMCE weighted-average formula provided.", 263 "source": "opus" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": false, 270 "justification": "Neither the 10,000 generated scenarios nor the LLM response data are released. The human baseline data from Awad et al. (2018) is public, but the authors' own experimental outputs are not available for verification.", 271 "source": "opus" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "Section 4.1 describes generating 10,000 scenarios via constrained randomization following Takemoto (2024), prompting 3 models with 14 personas + baseline using specified API settings, and the valid response filtering process (Table 4).", 277 "source": "opus" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants were recruited for this study. 
The human baseline data comes from the public Moral Machine experiment (Awad et al., 2018), a standard benchmark dataset.", 283 "source": "opus" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "The pipeline is traceable: scenario generation (10,000 via constrained randomization) → persona assignment (14 personas + baseline) → LLM response collection (3 models) → valid response filtering (rates in Table 4) → AMCE computation → MDD calculation. Culture AMCE weighted-average formula is provided.", 289 "source": "opus" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "No training data cutoff dates are stated for any of the three models. The paper specifies model versions but not when their training data was collected.", 297 "source": "opus" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": false, 302 "justification": "No discussion of whether the Moral Machine experiment data (published in Nature, 2018) — including its scenarios, AMCE values, and survey results — appeared in the training data of GPT-4o, GPT-3.5, or Llama2.", 303 "source": "opus" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": false, 308 "justification": "The Moral Machine experiment is a high-profile 2018 Nature publication. All three models were trained well after 2018 and likely encountered this data. The scenario generation follows Takemoto (2024), but whether the moral dilemma framework itself is contaminated in the models is never discussed.", 309 "source": "opus" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "This paper does not conduct a human subjects study. 
It uses pre-existing human data from the public Moral Machine dataset (Awad et al., 2018) as baselines.", 317 "source": "opus" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants were recruited. The study queries LLMs and compares to pre-existing public survey data.", 323 "source": "opus" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants were recruited for this study. The demographics of the original Moral Machine participants are described in Appendix A.2 as context for the baseline data.", 329 "source": "opus" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants were recruited. The study uses pre-existing public data.", 335 "source": "opus" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants were recruited or assigned to conditions.", 341 "source": "opus" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants were involved in this study.", 347 "source": "opus" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants were recruited for this study.", 353 "source": "opus" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": false, 360 "justification": "The study queries 3 LLMs across 10,000 scenarios × 15 persona conditions (150,000+ API calls per model). 
No cost, token usage, or latency information is reported.", 361 "source": "opus" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "No mention of total computational budget, GPU hours for Llama2 inference, API costs for GPT models, or wall-clock time for the experiments.", 367 "source": "opus" 368 } 369 }, 370 "experimental_rigor": { 371 "seed_sensitivity_reported": { 372 "applies": true, 373 "answer": false, 374 "justification": "Temperature=1 for GPT models introduces substantial stochasticity in responses. No multiple-seed or multiple-run analysis is reported.", 375 "source": "opus" 376 }, 377 "number_of_runs_stated": { 378 "applies": true, 379 "answer": false, 380 "justification": "The paper does not state how many times each scenario was run per model-persona combination. Results appear to be from a single run.", 381 "source": "opus" 382 }, 383 "hyperparameter_search_budget": { 384 "applies": true, 385 "answer": false, 386 "justification": "Hyperparameters are adopted from Takemoto (2024) for Llama2 and defaults for GPT models. No search budget is reported, and no justification is given for why these specific settings are optimal for this study.", 387 "source": "opus" 388 }, 389 "best_config_selection_justified": { 390 "applies": true, 391 "answer": true, 392 "justification": "Model selection is justified in Section 4.1: 'We select three top-performing models from their LLM series based on their strong alignment with human responses reported in Takemoto (2024), and their high valid response rate.'", 393 "source": "opus" 394 }, 395 "multiple_comparison_correction": { 396 "applies": true, 397 "answer": false, 398 "justification": "The study makes many comparisons across 7 persona categories × 2 groups × 9 moral dimensions × 3 models. 
No multiple comparison correction is applied — in fact, no statistical tests are performed at all.", 399 "source": "opus" 400 }, 401 "self_comparison_bias_addressed": { 402 "applies": false, 403 "answer": false, 404 "justification": "The authors do not propose a new system. They evaluate existing public models (GPT-4o, GPT-3.5, Llama2) against existing public human data, so self-comparison bias does not apply.", 405 "source": "opus" 406 }, 407 "compute_budget_vs_performance": { 408 "applies": false, 409 "answer": false, 410 "justification": "The study is not comparing competing methods at different compute budgets. It examines behavioral patterns across persona conditions.", 411 "source": "opus" 412 }, 413 "benchmark_construct_validity": { 414 "applies": true, 415 "answer": true, 416 "justification": "Section 5 ('Expanding Moral Machine Scenarios') explicitly discusses construct validity: the Moral Machine 'predominantly scrutinizes ethical choices related to autonomous vehicles, which represents a narrow subset of ethical issues' and calls for expanding to 'other types of moral decisions, such as allocation of healthcare resources, judicial decisions, or climate change policies.'", 417 "source": "opus" 418 }, 419 "scaffold_confound_addressed": { 420 "applies": false, 421 "answer": false, 422 "justification": "No scaffolding is used. Models are directly prompted with persona instructions and scenario descriptions.", 423 "source": "opus" 424 } 425 }, 426 "data_leakage": { 427 "temporal_leakage_addressed": { 428 "applies": true, 429 "answer": false, 430 "justification": "The Moral Machine experiment (2018) and all its data predate the training of all three models. 
Whether models have memorized patterns from this widely-cited Nature paper is not discussed.", 431 "source": "opus" 432 }, 433 "feature_leakage_addressed": { 434 "applies": true, 435 "answer": false, 436 "justification": "No discussion of whether the structured moral dilemma scenarios leak information about expected responses. The moral machine framework's binary choice structure may cue socially desirable responses.", 437 "source": "opus" 438 }, 439 "non_independence_addressed": { 440 "applies": true, 441 "answer": false, 442 "justification": "No discussion of whether the 10,000 generated scenarios share structural dependencies (constrained randomization could produce correlated scenarios) or whether model responses across scenarios are independent.", 443 "source": "opus" 444 }, 445 "leakage_detection_method": { 446 "applies": true, 447 "answer": false, 448 "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference tests, or contamination analysis.", 449 "source": "opus" 450 } 451 } 452 } 453 }, 454 "claims": [ 455 { 456 "claim": "LLM moral decisions vary substantially by persona while human decisions remain robust to demographic context", 457 "evidence": "Figure 3 shows Llama2 MDD=0.56 (age), GPT-3.5 MDD=0.48 (political) vs human MDD=0.33 (age). Figure 4 shows human preferences consistently >0 while LLM preferences flip negative for multiple dimensions (e.g., gender, species, age).", 458 "supported": "strong" 459 }, 460 { 461 "claim": "Political persona induces largest decision shifts in LLMs, reflecting partisan sorting", 462 "evidence": "Figure 3: GPT-4o political MDD=0.48 (largest among all personas). Figure 4 shows conservative persona favors high status, progressive favors low status. 
Mentioned as 'political persona predominates the direction and degree of LLM decisions.'", 463 "supported": "strong" 464 }, 465 { 466 "claim": "Approximately 20% of LLM moral decisions misalign with human responses under persona conditions", 467 "evidence": "Figure 5: GPT-3.5 ~20%, Llama2 ~20% show decision flips vs human baseline. GPT-4o lower (~5%).", 468 "supported": "strong" 469 }, 470 { 471 "claim": "GPT-4o aligns better with human moral reasoning than GPT-3.5 and Llama2", 472 "evidence": "Table 2: GPT-4o has 7/14 bold (lowest) values; GPT-3.5 7/14; Llama2 0/14. Figure 3: GPT-4o has lower MDD across most categories. Valid response rates highest for GPT-4o (Table 4).", 473 "supported": "strong" 474 }, 475 { 476 "claim": "Llama2 exhibits severe guardrail-triggered refusals, limiting evaluability for certain personas", 477 "evidence": "Table 4: Llama2 valid response rate for conservative persona=7.2%, female=17.1%, religious=6.0%. These refusal rates of roughly 83-94% are reported but not flagged as a validity concern.", 478 "supported": "strong" 479 }, 480 { 481 "claim": "Persona effects are consistent within model but differ between models (e.g., GPT-4o affected mainly by political, others by all factors)", 482 "evidence": "Figure 6: GPT-4o shows 'high variance predominantly by political persona' while 'GPT-3.5 and Llama2 show large changes for all other persona settings.' Patterns differ per model.", 483 "supported": "strong" 484 } 485 ], 486 "methodology_tags": [ 487 "benchmark-eval", 488 "observational" 489 ], 490 "key_findings": "LLMs exhibit persona-dependent moral reasoning that diverges sharply from human responses, particularly under political identity conditions. Unlike humans who maintain consistent moral preferences regardless of stated demographic characteristics, all three tested models show substantial shifts in decision-making when assigned personas, with political persona inducing the largest alignment gaps (MDD=0.48 for GPT-4o vs 0.33 for age). 
This 'partisan sorting' effect suggests LLMs may amplify rather than mimic human moral biases. However, Llama2 suffers catastrophic refusal rates for certain personas (6-17% valid responses), raising questions about the generalizability of findings across model families.", 491 "red_flags": [ 492 { 493 "flag": "No statistical hypothesis testing", 494 "detail": "All results reported as point estimates (MDD, AMCE) without confidence intervals, p-values, or significance tests. Cannot determine if observed differences exceed noise or are reliable." 495 }, 496 { 497 "flag": "Critical failure mode not discussed", 498 "detail": "Llama2 valid response rates collapse for certain personas (conservative 7.2%, female 17.1%, religious 6.0%), meaning roughly 83-94% of responses are refusals/NA, yet this is not flagged as a validity threat or analyzed." 499 }, 500 { 501 "flag": "No code or data release", 502 "detail": "AMCE and MDD calculations cannot be verified. 10,000 generated scenarios and LLM responses not available for independent replication or audit." 503 }, 504 { 505 "flag": "Benchmark contamination not addressed", 506 "detail": "Moral Machine (2018) is a well-known public benchmark. All three models were trained well after 2018 and almost certainly saw it during pretraining, potentially biasing results, but paper does not discuss this risk." 507 }, 508 { 509 "flag": "Scenario generation underspecified", 510 "detail": "'Constrained randomization' described but constraints not detailed. How are 10K scenarios sampled? Without this, scenario bias cannot be assessed." 511 }, 512 { 513 "flag": "Invalid response handling undocumented", 514 "detail": "Table 4 shows refusal rates but paper does not document how invalid/refused responses are handled in AMCE calculations (excluded, imputed, or treated as random choice?)." 515 }, 516 { 517 "flag": "No ablation study", 518 "detail": "Only tests persona ON vs OFF. 
Does not systematically ablate prompt wording, number of persona dimensions, or order of information to isolate what drives effect." 519 }, 520 { 521 "flag": "Sample size not justified", 522 "detail": "10,000 scenarios stated but no power analysis or justification. Is this adequate to detect differences between persona conditions?" 523 } 524 ], 525 "cited_papers": [ 526 { 527 "title": "The Moral Machine Experiment", 528 "relevance": "Foundational dataset and task framework. Original collection of 11M human moral preferences used as baseline for LLM alignment evaluation." 529 }, 530 { 531 "title": "The moral machine experiment on large language models (Takemoto 2024)", 532 "relevance": "Prior work examining LLM behavior on Moral Machine. Shows variation by LLM version. Current study extends this by exploring persona-dependent effects." 533 }, 534 { 535 "title": "Large-scale moral machine experiment on large language models (Ahmad & Takemoto 2024)", 536 "relevance": "Earlier large-scale comparison of multiple LLMs on moral machine, establishing baseline alignment measures that current study extends with persona variation." 537 }, 538 { 539 "title": "Language model alignment in multilingual trolley problems (Jin et al. 2024)", 540 "relevance": "Explores language effects on LLM moral decisions. Current study complements by exploring sociodemographic persona effects within single language context." 541 }, 542 { 543 "title": "Bias runs deep: Implicit reasoning biases in persona-assigned LLMs (Gupta et al. 2024)", 544 "relevance": "Shows persona-based prompting can expose/amplify biases in LLMs. Directly related methodological approach for persona assignment in current work." 545 }, 546 { 547 "title": "Moral foundations of large language models (Abdulhai et al. 2023)", 548 "relevance": "Explores political bias and moral foundations in LLMs. Paper references this to interpret partisan sorting finding observed in results." 
549 }, 550 { 551 "title": "Moral mimicry: Large language models produce moral rationalizations tailored to political identity (Simmons 2022)", 552 "relevance": "Early work showing LLMs generate text biased to political identity when assigned liberal/conservative personas. Foundational for current study's political persona hypothesis." 553 } 554 ], 555 "engagement_factors": { 556 "practical_relevance": { 557 "score": 1, 558 "justification": "Provides insights for anyone deploying LLMs in persona-sensitive contexts, but offers no tool, technique, or concrete mitigation strategy." 559 }, 560 "surprise_contrarian": { 561 "score": 2, 562 "justification": "The finding that LLMs are far more sensitive to political personas than humans — inverting the human pattern where politics barely affects moral decisions — is surprising and counterintuitive." 563 }, 564 "fear_safety": { 565 "score": 1, 566 "justification": "Raises concerns about persona-driven bias in morally sensitive applications, but does not demonstrate a novel attack or existential risk." 567 }, 568 "drama_conflict": { 569 "score": 1, 570 "justification": "The political sycophancy finding has some controversy potential, but the paper is measured in tone and focused on an academic framework." 571 }, 572 "demo_ability": { 573 "score": 0, 574 "justification": "No code, demo, or reproducible artifact is released." 575 }, 576 "brand_recognition": { 577 "score": 2, 578 "justification": "Evaluates GPT-4o and Llama2 — well-known models — and is published at ICLR 2025, a top venue." 
579 } 580 }, 581 "hn_data": { 582 "threads": [ 583 { 584 "hn_id": "46728063", 585 "title": "New York Times games are hard: A computational perspective", 586 "points": 73, 587 "comments": 33, 588 "url": "https://news.ycombinator.com/item?id=46728063" 589 }, 590 { 591 "hn_id": "43205755", 592 "title": "Towards an AI Co-Scientist", 593 "points": 47, 594 "comments": 17, 595 "url": "https://news.ycombinator.com/item?id=43205755" 596 }, 597 { 598 "hn_id": "44253021", 599 "title": "SmartAttack: Air-Gap Attack via Smartwatches", 600 "points": 18, 601 "comments": 6, 602 "url": "https://news.ycombinator.com/item?id=44253021" 603 }, 604 { 605 "hn_id": "44272942", 606 "title": "Securing Credit Inquiries: Real-Time User Approval to Stop SSN Identity Theft", 607 "points": 6, 608 "comments": 0, 609 "url": "https://news.ycombinator.com/item?id=44272942" 610 }, 611 { 612 "hn_id": "43763905", 613 "title": "Visual Language Models show widespread deficits on neuropsychological tests", 614 "points": 3, 615 "comments": 0, 616 "url": "https://news.ycombinator.com/item?id=43763905" 617 }, 618 { 619 "hn_id": "44366937", 620 "title": "SmartAttack: Air-Gap Attack via Smartwatches", 621 "points": 2, 622 "comments": 0, 623 "url": "https://news.ycombinator.com/item?id=44366937" 624 }, 625 { 626 "hn_id": "44254732", 627 "title": "SmartAttack: Air-Gap Attack via Smartwatches", 628 "points": 2, 629 "comments": 0, 630 "url": "https://news.ycombinator.com/item?id=44254732" 631 }, 632 { 633 "hn_id": "46345690", 634 "title": "Computational complexity of New York Times games", 635 "points": 1, 636 "comments": 0, 637 "url": "https://news.ycombinator.com/item?id=46345690" 638 }, 639 { 640 "hn_id": "22971877", 641 "title": "Impact of Bias on School Admissions and Targeted Interventions", 642 "points": 1, 643 "comments": 0, 644 "url": "https://news.ycombinator.com/item?id=22971877" 645 } 646 ], 647 "top_points": 73, 648 "total_points": 153, 649 "total_comments": 56 650 } 651 }