ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28046B)


      1 {
      2   "paper": {
      3     "title": "Psychometric Personality Shaping Modulates Capabilities and Safety in Language Models",
      4     "authors": ["Stephen Fitz", "Peter Romero", "Steven Basart", "Sipeng Chen", "José Hernández-Orallo"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2509.16332",
      8     "doi": "10.48550/arXiv.2509.16332"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Personality shaping via Big Five prompts systematically modulates LLM performance on safety benchmarks (WMDP, TruthfulQA, ETHICS, Sycophancy) largely independently of capability (MMLU). Low Conscientiousness causes catastrophic drops (20-40 pp) across both safety and capability, while adversarial dark-triad combinations degrade safety by 20+ pp with minimal capability loss. The effects are robust across prompt variations with large Cohen's d values (>1.0 for most safety benchmarks). More capable models show greater sensitivity to personality shaping, suggesting they are better at enacting prompted personas.",
     14   "claims": [
     15     {
     16       "claim": "Reducing Conscientiousness leads to significant drops in safety-relevant metrics (20-40 pp) and general capabilities (MMLU)",
     17       "evidence": "Tables 8-12 show CON_LO causing drops across all models: GPT-4.1 ETHICS_CM drops from 71.4% to 38.7%, MMLU from 84.6% to 67.8% (Table 8). Similar patterns across all five models.",
     18       "supported": "strong"
     19     },
     20     {
     21       "claim": "Personality shaping can alter safety scores while leaving capability (MMLU) largely unchanged, decoupling safety from capability",
     22       "evidence": "Section 5: MEDIUM Agreeableness increases Llama-4 TruthfulQA by +9.2 while MMLU shifts only -0.9. Dark-triad combination drops ETHICS_CM by 26.4 pp in GPT-4.1 with MMLU loss below 3 pp (Figure 4).",
     23       "supported": "strong"
     24     },
     25     {
     26       "claim": "High Extraversion reliably reduces factual honesty (TruthfulQA)",
     27       "evidence": "Section 5: GPT-4.1 falls 4.6 pp, Llama-4 falls 6.8 pp, Llama-3-70B falls 9.4 pp on TruthfulQA under High Extraversion (Tables 8-11).",
     28       "supported": "strong"
     29     },
     30     {
     31       "claim": "Personality effects are robust across prompt variations with large effect sizes",
     32       "evidence": "Appendix B: Cohen's d for CON_HI vs CON_LO on MMLU is 1.78, ETHICS-CM 2.47, TruthfulQA 2.16 (Table 6). Robustness tested across semantic tone, syntactic structure, postamble formulation, and temperature variations.",
     33       "supported": "strong"
     34     },
     35     {
     36       "claim": "More capable models are more sensitive to personality shaping because they are 'better actors'",
     37       "evidence": "Section 5 discusses GPT-4.1 being most brittle to Low Conscientiousness. However, this is speculative — one explanation among several offered ('other mechanisms may also contribute').",
     38       "supported": "moderate"
     39     },
     40     {
     41       "claim": "These findings 'put all reported results of safety benchmarks into question'",
     42       "evidence": "Section 7 conclusion. This is an overclaim — the study tests 5 models with one prompting strategy. The results show personality sensitivity but do not invalidate all prior safety benchmark results.",
     43       "supported": "weak"
     44     }
     45   ],
     46   "checklist": {
     47     "artifacts": {
     48       "code_released": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No repository URL, code archive, or link to source code found anywhere in the paper."
     52       },
     53       "data_released": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "All benchmarks used (MMLU, TruthfulQA, WMDP, ETHICS, Sycophancy) are publicly available. Full prompt text is provided in Appendix D. IPIP-NEO and SD3 are public-domain instruments."
     57       },
     58       "environment_specified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper mentions 'CPU-only cluster' and lists generation configurations (Table 19) but provides no requirements.txt, Docker, conda environment, or library versions. 'inspect_evals' is mentioned but no version specified."
     62       },
     63       "reproduction_instructions": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No step-by-step reproduction instructions, README, or scripts provided. The methodology is described but not in a reproducible recipe format."
     67       }
     68     },
     69     "statistical_methodology": {
     70       "confidence_intervals_or_error_bars": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Appendix B (Tables 6-7) reports mean and standard deviation across prompt variations for the robustness analysis. Main results are deterministic (temp=0, fixed seed) so CIs are not applicable there."
     74       },
     75       "significance_tests": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No formal significance tests (p-values, t-tests, etc.) are reported. The paper uses Cohen's d effect sizes and range-standardized effect sizes but no hypothesis testing."
     79       },
     80       "effect_sizes_reported": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Appendix A reports range-standardized effect sizes (ΔM/4) with scaled Euclidean norms for all conditions (Tables 1-5). Appendix B reports Cohen's d between CON_HI and CON_LO (Table 6). Eq. 1 defines the effect size measure."
     84       },
     85       "sample_size_justified": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No justification for why 5 models were selected beyond 'popularity and accessibility.' No power analysis or discussion of whether the sample of models is sufficient for the claims."
     89       },
     90       "variance_reported": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Appendix B reports standard deviations across prompt variations (Tables 6-7). Main results use deterministic decoding (temp=0, seed=43) so single-run results are expected. The robustness analysis provides spread measures."
     94       }
     95     },
     96     "evaluation_design": {
     97       "baselines_included": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "All results are reported as percentage-point changes relative to a neutral/baseline system prompt (no personality conditioning). Raw baseline scores are provided in Tables 8-12."
    101       },
    102       "baselines_contemporary": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The baseline is the model's own default behavior (no personality prompt), which is the appropriate comparison for measuring the effect of personality shaping."
    106       },
    107       "ablation_study": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Individual Big Five traits are varied in isolation (High/Medium/Low), and two composite profiles are tested (dark-triad combination, all-medium). This constitutes a systematic ablation of trait contributions."
    111       },
    112       "multiple_metrics": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Twelve benchmark metrics are used: MMLU, TruthfulQA, WMDP (bio/chem/cyber), ETHICS (5 sub-categories), and Sycophancy (2 measures), plus IPIP-NEO and SD3 personality validation."
    116       },
    117       "human_evaluation": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "No human evaluation of model outputs. All evaluation is automated via benchmark scoring. Given claims about 'safety behavior' and 'ethical behavior,' human assessment of actual model outputs under personality shaping would strengthen the findings."
    121       },
    122       "held_out_test_set": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Standard established benchmark test sets are used (MMLU, TruthfulQA, WMDP, ETHICS, Sycophancy). No tuning is performed on test data."
    126       },
    127       "per_category_breakdown": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Extensive breakdowns: per model (5 models), per trait (5 traits × 3 levels + 2 combinations), per benchmark sub-category (WMDP: bio/chem/cyber; ETHICS: cm/deontology/justice/utilitarianism/virtue). Tables 8-17 and Figure 2 heat maps."
    131       },
    132       "failure_cases_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper discusses where personality shaping has minimal effects (e.g., some models show 'little to no change in MMLU'), model-specific differences, and the GPT-4.1 brittleness pattern. Section 5 notes inconsistencies across model families."
    136       },
    137       "negative_results_reported": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper reports that some trait manipulations have minimal effects (e.g., Llama-4 shows 'small decline' on ETHICS-Deontology with High Conscientiousness while Llama-3-70B shows gains). Not all manipulations produce consistent effects across models."
    141       }
    142     },
    143     "claims_and_evidence": {
    144       "abstract_claims_supported": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Abstract claims about conscientiousness effects on WMDP, TruthfulQA, ETHICS, Sycophancy, and MMLU are supported by results in Tables 8-12. The claim about personality shaping as 'underexplored axis of model control' is supported by the experimental findings."
    148       },
    149       "causal_claims_justified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The study design is interventional: personality prompts are the independent variable, benchmark scores the dependent variable, with a controlled baseline. The causal claims ('reducing conscientiousness leads to drops') are justified by the experimental manipulation. Temperature=0 and fixed seed ensure deterministic outputs."
    153       },
    154       "generalization_bounded": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 6 explicitly bounds findings: 'prompt effects do not transfer uniformly across model families,' notes single prompt template limitation, discusses anthropocentric taxonomy concerns, and acknowledges isolated-trait manipulation as a constraint."
    158       },
    159       "alternative_explanations_discussed": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 5 discusses multiple alternative explanations: prompt framing effects vs genuine trait modulation, capacity sensitivity, safetywashing confounds. Section 6 discusses prompt brittleness and construct under-representation. Appendix C provides extended discussion."
    163       },
    164       "proxy_outcome_distinction": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "The paper extensively discusses the gap between benchmark scores and actual safety/capability, engaging with the safetywashing critique (Ren et al. 2024). Section 5 notes 'A benchmark can be both capability-sensitive and personality-sensitive' and discusses what benchmarks actually measure vs. what is claimed."
    168       }
    169     },
    170     "setup_transparency": {
    171       "model_versions_specified": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Table 18 lists specific versions: 'gpt-4.1-2025-04-14', 'Meta-Llama-3-70B-Instruct', 'Meta-Llama-3-8B-Instruct', 'Llama-4-Maverick-17B-128E-Instruct', 'DeepSeek-V3-0324'."
    175       },
    176       "prompts_provided": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Full prompt text for all 32 personality combinations is provided in Appendix D, including the complete preamble structure, trait markers, and postamble for personality validation. The prompt construction method is detailed in Section 3."
    180       },
    181       "hyperparameters_reported": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Table 19 reports all generation parameters: Temperature=0.0, Top P=1.0, Top K=0.0, Frequency Penalty=0.0, Presence Penalty=0.0, Repetition Penalty=1.0, Max Tokens=None, Random Seed=43."
    185       },
    186       "scaffolding_described": {
    187         "applies": false,
    188         "answer": false,
    189         "justification": "No agentic scaffolding is used. Models are evaluated via direct chat completion API calls with personality prompts as system messages."
    190       },
    191       "data_preprocessing_documented": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section 3 details the prompt construction: how trait markers are concatenated with qualifiers, the preamble/postamble structure, and how the 104 adjective pairs map to Big Five dimensions. Section 4 describes how benchmarks are administered. Most evaluations use inspect_evals framework."
    195       }
    196     },
    197     "limitations_and_scope": {
    198       "limitations_section_present": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 6 'Limitations' with four specific subsections: prompt brittleness, isolated-trait manipulation, anthropocentric taxonomy, and safety-capability entanglement."
    202       },
    203       "threats_to_validity_specific": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Section 6 provides specific threats: single Likert-style system prompt may conflate construct variance with prompt artifacts (Ceron et al. 2024), isolated Big Five dimensions neglect trait-trait couplings (Tett and Burnett 2003), human taxonomy may not span LLM behavioral manifold (Burnell et al. 2023, Suh et al. 2024)."
    207       },
    208       "scope_boundaries_stated": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Section 6 states what was NOT tested: multi-trait adaptive controllers, non-Big-Five factor structures, full personality × capability crossed controls. Appendix C extends with discussion of what is NOT established about symbol grounding and cross-cultural applicability."
    212       }
    213     },
    214     "data_integrity": {
    215       "raw_data_available": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No raw data (individual model responses, per-item scores) is released. Only aggregated benchmark accuracy percentages are provided in tables."
    219       },
    220       "data_collection_described": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Section 4 describes the experimental setup: models accessed via chat completion APIs (OpenRouter, OpenAI, Azure), personality prompts placed as system prompts, standard benchmark evaluation procedures via inspect_evals."
    224       },
    225       "recruitment_methods_described": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "No human participants. The study evaluates LLMs on standard benchmarks."
    229       },
    230       "data_pipeline_documented": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "The pipeline is documented: trait markers → prompt construction (Section 3) → system prompt injection → benchmark evaluation via inspect_evals → accuracy computation. Table 19 provides generation parameters."
    234       }
    235     },
    236     "conflicts_of_interest": {
    237       "funding_disclosed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No funding acknowledgment section found in the paper. No grants, sponsors, or funding agencies mentioned."
    241       },
    242       "affiliations_disclosed": {
    243         "applies": true,
    244         "answer": true,
    245         "justification": "Author affiliations are listed: Keio University, Universitat Politècnica de València, Center for AI Safety, Carnegie Mellon University. Authors are from academic institutions and a safety research org, not from companies whose products are evaluated."
    246       },
    247       "funder_independent_of_outcome": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "No funding information disclosed, so independence cannot be verified. Co-author Steven Basart is affiliated with Center for AI Safety and is also a co-author on the WMDP and safetywashing papers being evaluated/discussed, creating a potential intellectual conflict."
    251       },
    252       "financial_interests_declared": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "No competing interests statement or financial disclosure found in the paper."
    256       }
    257     },
    258     "contamination": {
    259       "training_cutoff_stated": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "No training data cutoff dates stated for any of the five models evaluated. This is important since MMLU, TruthfulQA, and ETHICS are old benchmarks likely in training data."
    263       },
    264       "train_test_overlap_discussed": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "No discussion of whether benchmark items appear in model training data. MMLU (2021), TruthfulQA (2022), ETHICS (2020) all predate the models used."
    268       },
    269       "benchmark_contamination_addressed": {
    270         "applies": true,
    271         "answer": false,
    272         "justification": "All benchmarks used (MMLU, TruthfulQA, WMDP, ETHICS) were published before the models' training. No contamination analysis, canary strings, or decontamination discussed."
    273       }
    274     },
    275     "human_studies": {
    276       "pre_registered": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study. All evaluation is automated LLM benchmarking."
    280       },
    281       "irb_or_ethics_approval": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants. Appendix G discusses ethical considerations of the research direction but no IRB approval is relevant."
    285       },
    286       "demographics_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants."
    290       },
    291       "inclusion_exclusion_criteria": {
    292         "applies": false,
    293         "answer": false,
    294         "justification": "No human participants."
    295       },
    296       "randomization_described": {
    297         "applies": false,
    298         "answer": false,
    299         "justification": "No human participants."
    300       },
    301       "blinding_described": {
    302         "applies": false,
    303         "answer": false,
    304         "justification": "No human participants."
    305       },
    306       "attrition_reported": {
    307         "applies": false,
    308         "answer": false,
    309         "justification": "No human participants."
    310       }
    311     },
    312     "cost_and_practicality": {
    313       "inference_cost_reported": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No API costs or per-example costs reported. Section 4.1 mentions 'around 24 hours to test all benchmarks for a single model' but no dollar costs or token counts."
    317       },
    318       "compute_budget_stated": {
    319         "applies": true,
    320         "answer": true,
    321         "justification": "Section 4.1 states: 'experiments are run on a CPU-only cluster, and it takes around 24 hours to test all benchmarks for a single model.' Table 18 notes API access methods (OpenRouter, OpenAI, Azure)."
    322       }
    323     },
    324     "experimental_rigor": {
    325       "seed_sensitivity_reported": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "A single fixed seed (43) is used with temperature=0 for all experiments (Table 19). No seed sensitivity analysis is performed. The robustness analysis (Appendix B) varies prompt formulations but not random seeds."
    329       },
    330       "number_of_runs_stated": {
    331         "applies": true,
    332         "answer": true,
    333         "justification": "Table 19 specifies Temperature=0.0 and Random Seed=43, implying deterministic single-run evaluation. The robustness analysis explicitly states variations across prompt dimensions."
    334       },
    335       "hyperparameter_search_budget": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "No hyperparameter search is reported. A single fixed configuration (Table 19) is used without justification for why these settings were chosen or whether alternatives were explored."
    339       },
    340       "best_config_selection_justified": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "A single configuration is used consistently across all experiments (Table 19). No cherry-picking of configurations — the same parameters apply to all models and conditions."
    344       },
    345       "multiple_comparison_correction": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "The study runs hundreds of comparisons (5 models × 17 conditions × 12+ metrics) with no correction for multiple comparisons. Cohen's d is reported but no family-wise error rate correction."
    349       },
    350       "self_comparison_bias_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The authors designed the prompting strategy and evaluate its effects without acknowledging potential experimenter bias in prompt design or interpretation of results."
    354       },
    355       "compute_budget_vs_performance": {
    356         "applies": false,
    357         "answer": false,
    358         "justification": "All conditions use the same compute budget (single API call per item). No compute differential between conditions."
    359       },
    360       "benchmark_construct_validity": {
    361         "applies": true,
    362         "answer": true,
    363         "justification": "The paper extensively discusses benchmark construct validity, engaging with the safetywashing critique (Ren et al. 2024). Section 5 and Appendix C discuss whether safety benchmarks measure actual safety or merely capability, and the paper positions personality as a tool for stress-testing this distinction."
    364       },
    365       "scaffold_confound_addressed": {
    366         "applies": false,
    367         "answer": false,
    368         "justification": "No scaffolding is used. Models are evaluated via direct API calls with system prompts."
    369       }
    370     },
    371     "data_leakage": {
    372       "temporal_leakage_addressed": {
    373         "applies": true,
    374         "answer": false,
    375         "justification": "No discussion of whether models' training data includes benchmark solutions. MMLU (2021), ETHICS (2020), TruthfulQA (2022) all predate the 2024-2025 models used."
    376       },
    377       "feature_leakage_addressed": {
    378         "applies": true,
    379         "answer": false,
    380         "justification": "No discussion of whether the evaluation setup (e.g., multiple-choice format, system prompt structure) leaks information to the model."
    381       },
    382       "non_independence_addressed": {
    383         "applies": true,
    384         "answer": false,
    385         "justification": "No discussion of whether benchmark items share structural similarities with training data or whether the same items appear across benchmarks."
    386       },
    387       "leakage_detection_method": {
    388         "applies": true,
    389         "answer": false,
    390         "justification": "No leakage detection or prevention methods used. No canary strings, membership inference, or decontamination analysis."
    391       }
    392     }
    393   },
    394   "red_flags": [
    395     {
    396       "flag": "Overclaiming in conclusion",
    397       "detail": "Section 7 claims 'our findings put all reported results of safety benchmarks into question' — an extreme generalization from 5 models tested with one prompting strategy. The results show personality sensitivity but do not invalidate all prior safety research."
    398     },
    399     {
    400       "flag": "No contamination awareness",
    401       "detail": "The paper uses old benchmarks (MMLU 2021, ETHICS 2020, TruthfulQA 2022) with models trained in 2024-2025 without any discussion of benchmark contamination. Personality-induced drops could interact with memorization effects."
    402     },
    403     {
    404       "flag": "Single prompt template",
    405       "detail": "Main results rely on a single Likert-style prompt template. While robustness analysis in Appendix B partially addresses this, it only covers GPT-4.1 and Llama-4 — not all five models."
    406     },
    407     {
    408       "flag": "No code or raw data release",
    409       "detail": "Despite the paper's emphasis on reproducibility concerns in AI safety research, the authors do not release their evaluation code or raw per-item results."
    410     },
    411     {
    412       "flag": "Intellectual conflict of interest",
    413       "detail": "Co-author Steven Basart is also a co-author on the WMDP benchmark and the safetywashing paper (Ren et al. 2024) that this paper engages with heavily. This relationship is not disclosed or discussed."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Safetywashing: Do AI Safety Benchmarks Actually Measure Safety Progress?",
    419       "authors": ["Richard Ren", "Steven Basart", "Adam Khoja"],
    420       "year": 2024,
    421       "relevance": "Central reference — this paper directly challenges safetywashing claims by showing personality modulates safety independently of capability."
    422     },
    423     {
    424       "title": "Personality traits in large language models",
    425       "authors": ["Gregory Serapio-García", "Mustafa Safdari"],
    426       "year": 2023,
    427       "arxiv_id": "2307.00184",
    428       "relevance": "Foundational work that this paper extends — established that LLMs exhibit stable Big Five profiles that can be shaped via prompting."
    429     },
    430     {
    431       "title": "The WMDP Benchmark: Measuring and Reducing Malicious Use with Unlearning",
    432       "authors": ["Nathaniel Li", "Alexander Pan"],
    433       "year": 2024,
    434       "arxiv_id": "2403.03218",
    435       "relevance": "Key safety benchmark used in this study; measures hazardous knowledge in LLMs."
    436     },
    437     {
    438       "title": "Towards Understanding Sycophancy in Language Models",
    439       "authors": ["Mrinank Sharma", "Meg Tong"],
    440       "year": 2024,
    441       "arxiv_id": "2310.13548",
    442       "relevance": "Sycophancy benchmark used in this study; personality shaping effects on sycophancy are a key finding."
    443     },
    444     {
    445       "title": "TruthfulQA: Measuring How Models Mimic Human Falsehoods",
    446       "authors": ["Stephanie Lin", "Jacob Hilton", "Owain Evans"],
    447       "year": 2022,
    448       "relevance": "Truthfulness benchmark used in this study; shows personality-induced drops in factual honesty."
    449     },
    450     {
    451       "title": "Measuring Massive Multitask Language Understanding",
    452       "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart"],
    453       "year": 2021,
    454       "relevance": "MMLU benchmark used as the capability measure in this study."
    455     },
    456     {
    457       "title": "Aligning AI with Shared Human Values",
    458       "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart"],
    459       "year": 2020,
    460       "arxiv_id": "2008.02275",
    461       "relevance": "ETHICS benchmark used in this study; personality-induced ethics score changes are central findings."
    462     },
    463     {
    464       "title": "Toxicity in ChatGPT: Analyzing Persona-Assigned Language Models",
    465       "authors": ["Ameet Deshpande", "Vishvak Murahari"],
    466       "year": 2023,
    467       "arxiv_id": "2304.05335",
    468       "relevance": "Prior work showing persona assignment affects model toxicity — related prompt-based behavioral control."
    469     },
    470     {
    471       "title": "WalledEval: A Comprehensive Safety Evaluation Toolkit for Large Language Models",
    472       "authors": ["Prannaya Gupta"],
    473       "year": 2024,
    474       "arxiv_id": "2408.03837",
    475       "relevance": "Safety evaluation toolkit with style-mutated variants for probing confounds — complements personality-based stress testing."
    476     },
    477     {
    478       "title": "SPRIG: Improving Large Language Model Performance by System Prompt Optimization",
    479       "authors": ["Lechen Zhang"],
    480       "year": 2024,
    481       "arxiv_id": "2410.14826",
    482       "relevance": "System prompt optimization work showing power and volatility of system-level prompt control."
    483     },
    484     {
    485       "title": "Revealing the structure of language model capabilities",
    486       "authors": ["Ryan Burnell", "Han Hao"],
    487       "year": 2023,
    488       "arxiv_id": "2306.10062",
    489       "relevance": "Factor analysis of LLM capabilities revealing latent structure that may not align with human personality taxonomy."
    490     },
    491     {
    492       "title": "Jailbroken: How Does LLM Safety Training Fail?",
    493       "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"],
    494       "year": 2024,
    495       "relevance": "Safety training failure modes — personality shaping represents a related but distinct attack vector."
    496     }
    497   ]
    498 }

Impressum · Datenschutz