ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (34372B)


      1 {
      2   "paper": {
      3     "title": "Moral Alignment for LLM Agents",
      4     "authors": [
      5       "Elizaveta Tennant",
      6       "Stephen Hailes",
      7       "Mirco Musolesi"
      8     ],
      9     "year": 2024,
     10     "venue": "International Conference on Learning Representations",
     11     "arxiv_id": "2410.01639",
     12     "doi": "10.48550/arXiv.2410.01639"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "Fine-tuning LLM agents (Gemma2-2b-it) with intrinsic moral rewards via PPO successfully teaches Deontological and Utilitarian strategies on the Iterated Prisoner's Dilemma. Deontological fine-tuning generalizes robustly to other matrix games (Stag Hunt, Chicken, Bach or Stravinsky, Defective Coordination), while Utilitarian fine-tuning produces a cooperation bias that fails on games where defection maximizes collective welfare. Moral fine-tuning can partially unlearn a previously acquired selfish strategy, and fine-tuning on structured game prompts influences model behavior on unrelated prompts involving the same action tokens.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "GitHub repository provided in footnote 4: 'Code (fine-tuning and analysis): https://github.com/liza-tennant/LLM_morality.'"
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The environments are standard well-known matrix games (IPD, Stag Hunt, Chicken, Bach or Stravinsky, Defective Coordination) with payoff matrices fully specified in the paper (Table 3). Data is procedurally generated through game interactions. The released code includes the game implementations."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Appendix 8.1 lists specific package versions: 'trl 0.9.4, peft 0.11.1, transformers 4.42.3.' Hardware specified as 'single A100 or V100 GPU with up to 40GB VRAM.' 4-bit quantization and LoRA rank 64 are stated."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "While implementation details are extensive (Section 3.3, Appendix 8.1, Table 2) and code is released, the paper itself does not include step-by-step reproduction instructions with specific commands to run."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "95% confidence intervals reported throughout. Figure 5 caption: 'averaging values over 50 test games and five runs (± 95%CI).' Figure 13 also shows 95% CIs."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No statistical significance tests are reported. Comparisons between reward types and conditions rely on visual inspection of plots with confidence intervals, but no formal tests (t-tests, Mann-Whitney, etc.) are applied."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No formal effect size measures (Cohen's d, odds ratios, etc.) are reported. Results are presented as action type percentages and normalized moral regret values without formal effect size quantification."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No power analysis or formal justification for sample sizes. Five random seeds, batch sizes of 3 and 5, and 1000 training episodes are used. Batch size choice is justified only as 'strikes a nice balance between not running out of available CUDA memory, yet providing sufficient experience for stable and efficient training' (Section 3.3)."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Results are averaged across five random seeds with 95% confidence intervals. Section 4.1: 'For each experiment, we report average results across five random seeds.' CIs are shown in all main figures."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Multiple baselines: no fine-tuning (base Gemma2-2b-it), Game reward (selfish baseline), value-prompted baselines (Appendix 8.11), and comparisons across six reward types (Game, Deontological, Utilitarian, Game+Deontological, Game→Deontological, Game→Utilitarian)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Gemma2-2b-it is a recent open-source model. The baselines are appropriate for the experimental design: different reward signal types and prompting strategies. Value-prompted baselines (Appendix 8.11) provide a reasonable comparison point."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Systematic variation of reward types (Game, Deontological, Utilitarian, combinations) serves as ablation. Additional ablations include: different opponents (TFT, Random, AD, AC, LLM — Appendix 8.5), reversed action tokens (Appendix 8.4), permuted payoff matrices (Appendix 8.9), and various prompt formats (Appendix 8.10)."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Multiple evaluation metrics: action type distributions (Figures 3, 6), Deontological moral regret (Figure 5a), Utilitarian moral regret (Figure 5b), moral reward during training (Figure 13), reciprocity analysis (Figure 20)."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation of the agents' moral behavior. All evaluation is automated: action type classification, moral regret computation, and reward tracking. Human judgment of whether agent behavior is truly 'moral' could be informative but is not included."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Generalization evaluated on four held-out matrix games not used during training (Stag Hunt, Chicken, Bach or Stravinsky, Defective Coordination). New action tokens (action3/action4) used at test time instead of training tokens (action1/action2). Section 5.1: 'we run this evaluation using a new pair of action tokens.'"
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results broken down by: game type (Figures 5, 6), reward type, action-state combinations (C|C, C|D, D|C, D|D), opponent type (Appendix 8.5), and prompt variation (Appendix 8.10). Per-game analysis allows identification of where generalization succeeds or fails."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Extensive failure analysis: Utilitarian model's inability to defect when needed on Defective Coordination game (Section 5.1), poor generalization on coordination games, occasional non-convergence during training (Appendix 8.1: 'Occasionally...the training did not converge as the LLM never produced a legal token'), and reversed token evaluation failures (Appendix 8.9)."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Several negative results: value-prompted baselines showed no improvement over base model (Appendix 8.11: 'non fine-tuned models were just as unable to produce legal tokens'), Utilitarian generalization fails on Defective Coordination, and unlearning converges incompletely ('the training does not converge to levels of cooperation as high as in the purely prosocial fine-tuning', Section 4.3)."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims are hedged appropriately: 'promising general solution' (not 'proven'), 'might represent a more transparent and cost-effective alternative.' Specific claims about IPD learning, unlearning, and generalization to matrix games are supported by Figures 3-6. The abstract accurately notes generalization applies to 'certain moral strategies' and 'several other matrix game environments.'"
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Causal claims (e.g., 'fine-tuning with intrinsic rewards allows agents to learn moral strategies') are justified by the experimental design: controlled manipulation of a single variable (reward type) across conditions, multiple seeds, comparison against baselines. The RL fine-tuning setup is a controlled intervention."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "Title 'Moral Alignment for LLM Agents' implies broad applicability across LLM agents, but experiments use only Gemma2-2b-it (a 2B parameter model) on matrix game environments. Section 2.1 acknowledges adopting 'a particularly small open-source model' but the title and framing do not bound findings to this specific model class or environment type."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Multiple alternative explanations discussed: Utilitarian model may learn cooperation bias rather than utilitarian reasoning (Section 5.1), agents may learn token ordering rather than semantics (Appendix 8.8: 'the model simply learned to choose the first token of the two'), pre-training data may contain IPD knowledge (Section 3.1), and token position in payoff matrix may influence behavior (Appendix 8.9)."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper explicitly defines 'moral alignment' in terms of specific, operationalized reward functions (Table 1): Deontological = 'do not defect against a cooperator', Utilitarian = maximize collective payoff. The gap between matrix game behavior and broader moral alignment is acknowledged: 'A limitation of this approach is that it requires the specification of rewards for a particular environment' (Section 6)."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Gemma2-2b-it is specified as the model (Section 3.3), which is a specific open-source model checkpoint available on HuggingFace with a fixed architecture and weights. This is not a marketing name or API endpoint — it identifies a concrete model artifact."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Full prompt text provided in Figures 2, 7, 8, 9, 10, 11, 12, and 25. These include the exact game prompts used during training and all evaluation prompts. The actual text sent to the model is reproduced verbatim."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Extensive hyperparameter reporting in Section 3.3 and Appendix 8.1: LoRA rank 64, 4-bit quantization, batch sizes 3 and 5, gradient accumulation 4 steps, reward parameters ξ=3 and R_illegal=-6, T=1000 episodes, max generation length 2 tokens. Table 2 lists all parameter values tried. Default PPO parameters from TRL used."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. The LLM receives a structured prompt and directly outputs an action token. There are no tools, memory systems, retry logic, or workflow orchestration."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "The data generation pipeline is fully documented: prompt construction with game state (Section 3.1), action parsing from model output, reward computation (Table 1), illegal move handling, and state update procedure (Section 3.2). Random state initialization and episode structure are described."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 6 (Discussion) substantively discusses limitations: environment-specific reward specification, extension to more complex games, the need for multi-objective pluralistic alignment, and scalability to real-world scenarios."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Specific threats discussed: (1) the approach requires environment-specific reward specification (Section 6), (2) small model may not represent larger models (Section 2.1), (3) Utilitarian agent may learn cooperation bias rather than utilitarian reasoning (Section 5.1), (4) token ordering may confound semantic learning (Appendix 8.8), (5) occasional non-convergence (Appendix 8.1)."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 6 states: 'A limitation of this approach is that it requires the specification of rewards for a particular environment, whereas methods like RLHF rely on natural language data describing any domain.' Section 2.1 notes adoption of 'a particularly small open-source model.' Future work sections explicitly state what extensions are needed."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "Raw experimental data (training logs, action histories per episode, per-seed results) are not released. The code to regenerate data is available, but the actual raw data from the reported experiments is not provided for independent verification."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Data collection (game interaction) procedure thoroughly described: prompt construction (Figures 2, 7), episode structure with T=1000 episodes and batch sizes N=3/5 (Section 3.2), random state initialization, and action recording protocol."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. Data comes from LLM game interactions with algorithmic or LLM opponents. The model (Gemma2-2b-it) is a standard publicly available model."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Full pipeline documented: random state generation → prompt construction → model generation (max 2 tokens) → action parsing (legal vs illegal) → reward computation (Table 1) → PPO update (Section 3.2). Illegal move handling is explicitly described."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Acknowledgments section discloses: 'partially supported by the Leverhulme Trust through the Doctoral Training Programme' and 'partially supported by the Italian Ministry of University and Research (MUR) through the project PRIN 2022 MENTOR.'"
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations clearly listed: University College London and University of Bologna. Authors are not affiliated with a company whose product is being evaluated."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Funders are academic bodies (Leverhulme Trust, Italian MUR) with no financial interest in the outcomes of LLM moral alignment research."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement found in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The training data cutoff for Gemma2-2b-it is not stated. The model likely encountered Prisoner's Dilemma strategies in pre-training data, and knowing the cutoff would help assess this risk."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "Section 3.1 explicitly addresses this: 'we aim to mobilize the general decision-making elements of the model in playing the game, rather than allowing it to retrieve memorized responses for the Prisoner's Dilemma.' Abstract action tokens (action1/action2) used to avoid triggering IPD-specific pre-trained responses."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "Active mitigation measures taken: abstract action tokens instead of 'Cooperate'/'Defect', no mention of 'Prisoner's Dilemma' in prompts, new tokens at test time (action3/action4), reversed token evaluation (Appendix 8.4, 8.9). Section 3.1: 'we use a structured, implicit representation of the IPD as a general decision-making game.'"
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. All experiments involve LLM agents playing game-theoretic environments."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. Study involves computational experiments with LLM agents."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No inference cost or latency reported. Hardware is mentioned (A100/V100) but per-experiment cost, training time, or inference latency are not quantified."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "Hardware mentioned ('single A100 or V100 GPU with up to 40GB VRAM') but total GPU hours, wall-clock training time, or total compute budget are not stated."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Results reported across 5 random seeds with 95% confidence intervals. Section 4.1: 'For each experiment, we report average results across five random seeds.' Figures show variability across runs."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": true,
    305         "justification": "Explicitly stated: 'we report average results across five random seeds' (Section 4.1). Appendix 8.1 also notes occasional non-convergence: 'Occasionally (on one in six of the early runs), the training did not converge.'"
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Table 2 in Appendix 8.1 lists all parameter values tried across 8 hyperparameters including LoRA rank, target modules, KL control, gradient accumulation, reward normalization, R_illegal values, and payoff ranges. Action token choices also documented. Selection criterion stated: 'most stable fine-tuning.'"
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "Appendix 8.1: 'We chose the combination of values that resulted in the most stable fine-tuning.' Table 2 transparently shows all configurations tried, and the selection criterion (stability) is stated."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors implement all reward types and baselines themselves. No discussion of author-evaluation bias or use of independent evaluation. All comparisons are between the authors' own implementations of different reward functions."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": false,
    329         "answer": false,
    330         "justification": "All conditions use the same compute budget (1000 episodes, same hardware, same model). Compute differences between conditions are negligible by design."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "Section 6 discusses whether matrix games capture real moral alignment. Section 2.3 explains why the IPD 'represents many daily situations that might involve difficult social and ethical choices.' The paper acknowledges the gap between game-theoretic behavior and broader moral alignment, and discusses needed extensions."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is involved. The model directly receives a prompt and outputs an action token."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No explicit discussion of temporal leakage. The game environments (IPD, Stag Hunt, etc.) and their optimal strategies have been published for decades and are certainly in the model's pre-training data. The training cutoff date is not stated."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": true,
    352         "justification": "The evaluation setup is transparent: the model sees the payoff matrix and previous action as the complete input. Section 3.1 explicitly designs prompts to avoid leaking game identity ('without actually stating the terms Prisoner's Dilemma, cooperation or defection')."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether training and test episodes share structural dependencies. Training and evaluation both use the same IPD environment. While generalization tests use different games, the IPD evaluation uses the same environment as training."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": true,
    362         "justification": "Concrete detection methods used: new action tokens at test time (action3/action4 vs training action1/action2) to verify semantic learning vs memorization (Section 5.1), reversed token meaning tests (Appendix 8.9), and permuted payoff matrix orderings (Appendix 8.9, Figure 10)."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "LLM agents can be fine-tuned with intrinsic moral rewards to learn aligned moral strategies on the IPD.",
    369       "evidence": "Figure 3 shows learning dynamics: Deontological agents learn to avoid defecting against cooperators ~100% of the time, Utilitarian agents achieve mutual cooperation vs TFT, Game agents learn defective Nash equilibrium. Results averaged over 5 seeds with 95% CIs (Section 4.2).",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Moral fine-tuning can enable unlearning of a previously developed selfish strategy.",
    374       "evidence": "Figure 4 shows Game→Deontological and Game→Utilitarian transitions at episode 500. Agents partially reverse selfish behavior in both LLM vs TFT and LLM vs LLM settings. However, 'the training does not converge to levels of cooperation as high as in the purely prosocial fine-tuning' (Section 4.3).",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Moral strategies learned on the IPD generalize to other matrix game environments.",
    379       "evidence": "Figures 5 and 6 show test-time performance on 5 games with new action tokens. Deontological agents maintain low moral regret across all games. Utilitarian agents perform well on IPD and Chicken but poorly on coordination games (Section 5.1).",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Deontological fine-tuning produces especially robust generalization across games.",
    384       "evidence": "Figure 5a shows Deontological agents have the lowest moral regret across all five games. Figure 6 shows they consistently avoid defecting against cooperators regardless of game type (Section 5.1).",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "The Utilitarian model learns a cooperation bias rather than true utilitarian reasoning.",
    389       "evidence": "Figure 6 shows the Utilitarian model 'essentially always chooses to cooperate, regardless of its opponent's last move or the game's payoff structure.' This fails on Defective Coordination where mutual defection maximizes collective payoff (Section 5.1).",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Fine-tuning on matrix games influences model behavior on unrelated prompts.",
    394       "evidence": "Figures 18-20 and Appendix 8.8 show that fine-tuned models respond to game-related prompts without payoff matrices in patterns consistent with their learned moral values, especially when prompts mention a 'game' or include opponent state (Section 5.2).",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "This approach is a more transparent and cost-effective alternative to RLHF/DPO.",
    399       "evidence": "No direct comparison with RLHF or DPO is conducted. The claim is made argumentatively in the introduction and abstract ('might represent') without empirical support for cost-effectiveness or transparency advantages.",
    400       "supported": "unsupported"
    401     }
    402   ],
    403   "red_flags": [
    404     {
    405       "flag": "Single small model tested",
    406       "detail": "All experiments use only Gemma2-2b-it (2B parameters). The title and framing suggest applicability to 'LLM Agents' broadly, but results may not transfer to larger models, different architectures, or commercial LLMs. Section 2.1 acknowledges choosing a 'particularly small open-source model' but the generality of findings to other models is unknown."
    407     },
    408     {
    409       "flag": "Extremely simplified evaluation environment",
    410       "detail": "Matrix games with 2 actions and single-step history are far from real-world moral dilemmas. The 'generalization' tested is from one 2x2 matrix game to other 2x2 matrix games — all share the same action space, prompt format, and payoff structure. This does not establish generalization to natural language moral scenarios."
    411     },
    412     {
    413       "flag": "No direct comparison with existing alignment methods",
    414       "detail": "The abstract claims the approach 'might represent a more transparent and cost-effective alternative to currently predominant alignment techniques' (RLHF, DPO), but no direct comparison is performed. The value-prompted baselines (Appendix 8.11) are the only alternative, and they failed entirely because the base model couldn't produce legal tokens."
    415     },
    416     {
    417       "flag": "Utilitarian learns surface pattern, not value",
    418       "detail": "The Utilitarian agent learns to always cooperate regardless of context, which happens to align with utilitarian goals on the IPD but fails on Defective Coordination. This suggests the model learned a behavioral shortcut (always choose the cooperative action, regardless of the opponent's last move or the payoff structure) rather than internalizing utilitarian reasoning about collective welfare."
    419     },
    420     {
    421       "flag": "No significance tests for comparative claims",
    422       "detail": "Claims comparing reward types (e.g., 'Agents trained on the Deontological reward in particular are especially able to maintain this moral policy') are made based on visual inspection of plots with confidence intervals, without formal statistical tests."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback",
    428       "authors": ["Yuntao Bai", "Andy Jones", "Kamal Ndousse"],
    429       "year": 2022,
    430       "relevance": "Foundational RLHF alignment paper; this work proposes intrinsic rewards as an alternative to the human-feedback approach."
    431     },
    432     {
    433       "title": "Constitutional AI: Harmlessness from AI feedback",
    434       "authors": ["Yuntao Bai", "Saurav Kadavath", "Sandipan Kundu"],
    435       "year": 2022,
    436       "arxiv_id": "2212.08073",
    437       "relevance": "Constitutional AI approach to alignment without human preference data; related alternative alignment paradigm."
    438     },
    439     {
    440       "title": "Direct Preference Optimization: Your language model is secretly a reward model",
    441       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"],
    442       "year": 2023,
    443       "relevance": "Major alignment method (DPO) that this work contrasts its explicit intrinsic-reward approach against, characterizing DPO as relying on implicitly specified values."
    444     },
    445     {
    446       "title": "Foundational challenges in assuring alignment and safety of large language models",
    447       "authors": ["Usman Anwar", "Abulhair Saparov", "Javier Rando"],
    448       "year": 2024,
    449       "relevance": "Comprehensive survey of LLM alignment challenges including goal misgeneralization and pluralistic alignment."
    450     },
    451     {
    452       "title": "The alignment problem from a deep learning perspective",
    453       "authors": ["Richard Ngo", "Lawrence Chan", "Sören Mindermann"],
    454       "year": 2024,
    455       "relevance": "Analyzes alignment from a deep learning perspective, including situational awareness and goal misgeneralization."
    456     },
    457     {
    458       "title": "Open problems and fundamental limitations of Reinforcement Learning from Human Feedback",
    459       "authors": ["Stephen Casper", "Xander Davies", "Claudia Shi"],
    460       "year": 2023,
    461       "relevance": "Identifies fundamental limitations of RLHF that motivate alternative alignment approaches like intrinsic rewards."
    462     },
    463     {
    464       "title": "Playing repeated games with large language models",
    465       "authors": ["Elif Akata", "Lion Schulz", "Julian Coda-Forno"],
    466       "year": 2023,
    467       "arxiv_id": "2305.16867",
    468       "relevance": "Studies LLM behavior in repeated games including IPD; finds 'unforgiving' strategies in large models."
    469     },
    470     {
    471       "title": "GTBench: Uncovering the strategic reasoning limitations of LLMs via game-theoretic evaluations",
    472       "authors": ["Jinhao Duan", "Renming Zhang", "James Diffenderfer"],
    473       "year": 2024,
    474       "relevance": "Benchmark for LLM strategic reasoning in games including the IPD; suggests models lack true strategic reasoning."
    475     },
    476     {
    477       "title": "Can large language models serve as rational players in game theory? A systematic analysis",
    478       "authors": ["Caoyun Fan", "Jindou Chen", "Yaohui Jin"],
    479       "year": 2024,
    480       "relevance": "Analyzes LLM rationality in game-theoretic settings; finds LLMs act differently from humans and are not fully rational."
    481     },
    482     {
    483       "title": "Training socially aligned language models on simulated social interactions",
    484       "authors": ["Ruibo Liu", "Ruixin Yang", "Chenyan Jia"],
    485       "year": 2024,
    486       "relevance": "Trains socially aligned LLMs through simulated interactions; related approach to social/moral alignment."
    487     },
    488     {
    489       "title": "A roadmap to Pluralistic Alignment",
    490       "authors": ["Taylor Sorensen", "Jared Moore", "Jillian Fisher"],
    491       "year": 2024,
    492       "relevance": "Addresses pluralistic alignment — aligning to diverse human values — which this paper's multi-reward approach could support."
    493     },
    494     {
    495       "title": "Generative agents: Interactive simulacra of human behavior",
    496       "authors": ["Joon Sung Park", "Joseph C. O'Brien", "Carrie J. Cai"],
    497       "year": 2023,
    498       "relevance": "Influential LLM agent framework; demonstrates decision-making in open-ended environments."
    499     },
    500     {
    501       "title": "AI Alignment: A comprehensive survey",
    502       "authors": ["Jiaming Ji", "Tianyi Qiu", "Boyuan Chen"],
    503       "year": 2024,
    504       "arxiv_id": "2310.19852",
    505       "relevance": "Comprehensive survey of AI alignment approaches; provides context for the intrinsic rewards alternative."
    506     }
    507   ],
    508   "engagement_factors": {
    509     "practical_relevance": {
    510       "score": 1,
    511       "justification": "Method is specific to matrix game environments with explicit payoff matrices; not immediately applicable to real-world alignment of deployed LLMs."
    512     },
    513     "surprise_contrarian": {
    514       "score": 1,
    515       "justification": "Using explicit moral reward functions instead of implicit human preferences is a known idea; the contribution is demonstrating it works for LLM fine-tuning, not challenging a widely-held belief."
    516     },
    517     "fear_safety": {
    518       "score": 1,
    519       "justification": "Touches AI alignment broadly but does not demonstrate a novel attack or raise specific safety concerns."
    520     },
    521     "drama_conflict": {
    522       "score": 0,
    523       "justification": "No controversy, no criticism of existing approaches or labs, purely constructive research contribution."
    524     },
    525     "demo_ability": {
    526       "score": 1,
    527       "justification": "Code released on GitHub but requires GPU, RL training setup, and familiarity with the TRL library to reproduce."
    528     },
    529     "brand_recognition": {
    530       "score": 1,
    531       "justification": "UCL and University of Bologna are established universities but not high-profile AI labs. Published at ICLR 2025, which is a prestigious venue."
    532     }
    533   }
    534 }

Impressum · Datenschutz