ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27653B)


      1 {
      2   "paper": {
      3     "title": "Evaluating & Reducing Deceptive Dialogue From Language Models with Multi-turn RL",
      4     "authors": ["Marwa Abdulhai", "Ryan Cheng", "Aryansh Shrivastava", "Natasha Jaques", "Yarin Gal", "Sergey Levine"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2510.14318",
      8     "doi": "10.48550/arXiv.2510.14318"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "Section 4 provides a GitHub link: https://github.com/abdulhaim/deceptive_dialogue and a project page."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper states code is available at the GitHub repo, and the dialogue datasets are generated synthetically with the provided code. The generation pipeline is documented in detail (Section A.4)."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No requirements.txt, Dockerfile, or detailed environment setup is described in the paper. Only mentions of using OpenRLHF and vLLM, but no version-pinned dependencies."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README contents or reproduction commands are included."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Tables 1-3 and 5-8 report mean ± standard deviation for all metrics across tasks and models."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper makes many comparative claims (e.g., '77.6% reduction', '31% increase') but reports no statistical significance tests (no p-values, t-tests, or bootstrap tests). Comparisons are based solely on comparing mean values."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper reports percentage improvements with baseline context, e.g., '77.6% reduction compared to Llama 3-8B-Instruct' (Table 3), '32% and 235% increase in deception' (Section Q3). These provide effect magnitude context."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No justification given for the number of dialogues generated per task (Table 4 shows 3,372-7,751 dialogues) or for the 20 human annotators evaluating 60 dialogues. No power analysis."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Standard deviations are reported in all main results tables (Tables 1-3, 5-12)."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Table 3 compares RL-fine-tuned models against base Llama, instruction-tuned Llama, SFT, and truthful-prompted larger models. Table 1 compares belief misalignment against 4 existing deception metrics."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Baselines include Llama 3.1, GPT-4o-mini, Gemma-2, Mistral — all recent models at time of writing. Deception metrics compared include recent work from 2022-2024."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Table 3 shows ablation across RL methods (KTO, REINFORCE, PPO) and reward objectives (max-reward, min-deception, combined). The counterfactual study (Q4, Figure 3) also ablates prompting conditions."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Five deception metrics are evaluated (deception count, deception rating, falsehood count, deceptive regret, belief misalignment) plus task reward. Table 1 reports all five."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Section Q1: 20 annotators recruited via CloudResearch Connect evaluated 60 dialogues on a 1-5 Likert scale. Pearson correlation computed between metrics and human labels (Table 1)."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Section Q5: 'We trained Llama-3.1-8B on 9.7k dialogue pairs and evaluated them on a held-out set of 2.4k.'"
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down per task (4 dialogue domains), per model (8 models), and per prompting condition (default, deceptive, truthful, utilitarian) across Tables 2, 5-8."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section A.12 provides detailed examples of failure modes for each metric (A.12.1-A.12.3), and Section A.1 discusses limitations of the approach."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper reports that RLHF sometimes increases deception (Q3), that truthful prompting can paradoxically increase belief misalignment (Section A.14), and that KTO-max-reward increases deception (Table 3)."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims: 26% deception rate (supported by Table 2 averages), 31% increase when prompted (supported by counterfactual analysis), 43% RLHF deception rate (supported by Table 2), 77.6% reduction via multi-turn RL (supported by Table 3). The claimed figures are consistent with the reported results, although the paper does not explain how the 26% and 43% rates are derived from the belief misalignment scores."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper makes causal claims about RL fine-tuning reducing deception. The experimental design (controlled manipulation of training method, held-out evaluation) supports causal inference for the RL results. Counterfactual prompting (Section Q4) provides controlled comparisons across prompting conditions."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The title says 'Language Models' broadly. Results are from 4 synthetic dialogue tasks with 8 specific models. The paper does not adequately bound generalizations — claims like 'LLMs naturally exhibit deceptive behavior in approximately 26% of dialogue turns' suggest this applies to LLMs generally, but only 8 models in 4 contrived scenarios were tested."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Section A.10 discusses four alternative explanations for emergent deception (goal inference, training data biases, lack of penalization, misalignment). Section A.1 discusses limitations including annotator subjectivity and metric blind spots."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper measures belief misalignment via LLM-as-a-judge (J_LLM and L_LLM) as a proxy for actual deception, but does not discuss how well this LLM-based measurement captures real deception vs. the LLM judge's own biases. The gap between 'LLM judge says beliefs shifted' and 'actual deception occurred' is not acknowledged."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper lists 'gpt-3.5-turbo', 'gpt-4o-mini', 'Llama-3.1-8B', 'Llama-3.1-8B-Instruct', etc., but gives no API snapshot dates or specific version strings for the OpenAI models. Marketing names without snapshot dates do not count."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Sections A.8 and A.9 provide the actual prompt text for all deception metrics and all four counterfactual prompting conditions across all four tasks."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Section A.4 reports temperature=0.8, top_p=0.95 for local models and default settings for OpenAI. Tables 13-14 report full SFT and PPO hyperparameters."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used. The dialogue setup is two LLMs prompted directly, with a third LLM judge. No tools, retry logic, or agent workflows."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section A.4 documents the data generation pipeline in detail: buyer preference combinations (32), seller action space (243), seller personas (4), sampling strategy, and filtering for Deal or No Deal (3,996 valid combinations after filtering)."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section A.1 is titled 'Limitations' and provides substantive discussion of annotator subjectivity, metric blind spots, and subtler deception forms."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section A.1 discusses specific threats: 'deception is inherently subjective', '20 annotators' may introduce noise, metrics 'may miss subtler forms such as manipulative framing or strategic ambiguity.' These are specific to this study."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound the generalization to the 4 specific synthetic tasks or the 8 models tested. The Discussion (Section 6) discusses contributions without explicitly stating scope boundaries."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The GitHub repository is provided (https://github.com/abdulhaim/deceptive_dialogue) which should contain the generated dialogue datasets and code to regenerate them."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Sections 3.1 and A.4 describe in detail how dialogue data was generated: LLM agent pairs, prompting conditions, sampling parameters, and combinatorial design."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section Q1: '20 annotators recruited through CloudResearch Connect, a reliable platform that provides access to high-quality, vetted respondents with verified demographics and strong prior approval ratings.' IRB approval mentioned."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section A.4 documents the full pipeline: buyer preference generation → seller action space → persona assignment → dialogue generation → metric evaluation via LLM judge. Table 4 reports statistics per domain."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Section 7 (Acknowledgment): 'This research was supported by the Cooperative AI Foundation and DSIT, as well as the National Science Foundation under IIS-2246811.'"
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations clearly listed: UC Berkeley, University of Oxford, University of Washington, UK AI Security Institute, Google DeepMind."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Funders are NSF, Cooperative AI Foundation, and DSIT — research foundations/government agencies with no direct financial stake in whether LLMs are more or less deceptive."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement is present. Natasha Jaques is affiliated with Google DeepMind, which develops LLMs evaluated in related work, but no financial interest disclosure is provided."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "The paper generates synthetic dialogue data and evaluates models on these novel synthetic tasks. It does not evaluate pre-trained model knowledge on an existing benchmark — the deception tasks are novel and generated at evaluation time."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "Same reasoning: synthetic dialogue tasks generated at evaluation time, not a pre-existing benchmark that could be in training data."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "Not applicable — the evaluation tasks are novel synthetic dialogues, not pre-existing benchmarks."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "The paper includes a human evaluation study (20 annotators) but no pre-registration is mentioned."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": true,
    249         "answer": true,
    250         "justification": "Section Q1: 'We recruited 20 annotators (with IRB approval) through CloudResearch Connect.'"
    251       },
    252       "demographics_reported": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "No demographics reported for the 20 annotators beyond that they were from CloudResearch Connect with 'verified demographics and strong prior approval ratings.' No age, gender, experience level, or other characterization."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": true,
    259         "answer": false,
    260         "justification": "No inclusion/exclusion criteria stated for annotators beyond using CloudResearch Connect's vetting."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "Not an experimental study with human participants assigned to conditions. Annotators rated dialogues — no randomization to treatment/control needed."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "This is a rating study, not an experimental comparison. Blinding is not applicable to annotators rating dialogues for deceptiveness."
    271       },
    272       "attrition_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "No information on whether any annotators dropped out or were excluded. Only the final N=20 is reported."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference costs reported. The approach generates thousands of dialogues from GPT-3.5/GPT-4o-mini via API and uses LLM-as-judge for every metric evaluation, but no API costs or per-dialogue costs are stated."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Section A.15: 'Training was done with access to a cluster of 8 NVIDIA H100 GPUs as well as a cluster of 8 NVIDIA H200 GPUs.' However, total GPU hours are not stated."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Standard deviations are reported across dialogues, but no mention of multiple random seeds for the RL training runs. Single training run results appear to be reported for each RL method."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The paper does not state how many training runs were conducted for each RL method. Table 3 std devs appear to be across test dialogues, not across training runs."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Tables 13-14 report hyperparameters but no mention of how many configurations were tried or any hyperparameter search process."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "No discussion of how hyperparameters were selected. The reported PPO/KTO configurations appear without justification for why these specific values were chosen."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Many comparisons across 8 models, 4 tasks, 4 prompting conditions, and 5 metrics — no correction for multiple comparisons applied. No statistical tests performed at all."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors propose the belief misalignment metric and then show it outperforms other metrics. No acknowledgment of potential bias in evaluating their own metric. The LLM judge used is the same across all metrics, but the metric design choices favor their approach."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "Table 3 compares 8B RL-fine-tuned models against 70B prompted models without discussing the compute difference. PPO training on H100/H200 clusters vs. simple prompting is a major compute disparity not addressed."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "Section A.12 extensively discusses what each metric actually measures vs. what it claims, with concrete examples showing failure modes. The paper explicitly analyzes whether its synthetic tasks capture real-world deception."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": false,
    333         "answer": false,
    334         "justification": "No agentic scaffolding is involved. Models are prompted directly for dialogue generation."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "The tasks are synthetically generated at evaluation time with novel scenarios. There is no pre-existing benchmark that could leak through training data."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "The LLM judge (J_LLM) is conditioned on ground truth facts ϕ (Section 3.3). Whether this creates evaluation bias — the judge knowing the 'right answer' while rating deception — is not discussed."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "The paper generates training data (9.7k pairs) and test data (2.4k) from the same synthetic pipeline (Section Q5). Whether train and test dialogues share structural similarities is not discussed."
    352       },
    353       "leakage_detection_method": {
    354         "applies": false,
    355         "answer": false,
    356         "justification": "Not applicable — synthetic data generated at evaluation time, not a pre-existing benchmark."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "Belief misalignment correlates more closely with human judgments of deception than existing metrics (Pearson r=0.788).",
    363       "evidence": "Table 1: Human correlation scores — belief misalignment 0.788 vs. deceptive regret 0.738, deception count 0.672, falsehood count 0.609, deception rating 0.584. Based on 20 annotators rating 60 dialogues.",
    364       "supported": "moderate"
    365     },
    366     {
    367       "claim": "LLMs naturally exhibit deceptive behavior in approximately 26% of dialogue turns under default prompting.",
    368       "evidence": "Table 2 shows average belief misalignment of 0.41 across 8 models and 4 tasks under default settings. The 26% figure appears in the abstract but the methodology for deriving it from belief misalignment scores is not clearly explained.",
    369       "supported": "weak"
    370     },
    371     {
    372       "claim": "Models trained with RLHF still exhibit deception at a rate of 43% on average.",
    373       "evidence": "Table 2 shows instruction-tuned models with various belief misalignment scores. The 43% claim is in the abstract but derivation from the table values is unclear.",
    374       "supported": "weak"
    375     },
    376     {
    377       "claim": "Multi-turn RL fine-tuning with PPO reduces deception by 77.6% compared to instruction-tuned models.",
    378       "evidence": "Table 3: PPO-min-deception achieves 0.11 belief misalignment vs. Llama 3-8B-Instruct at 0.49, a (0.49-0.11)/0.49 = 77.6% reduction. However, task reward drops from 0.53 to 0.40.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "When prompted to deceive, LLMs can increase deceptiveness by as much as 31% relative to baselines.",
    383       "evidence": "Figure 3 and Tables 5-8 show counterfactual analysis. The 31% figure is stated in the abstract but the specific model/task combination producing this number is not clearly identified in the text.",
    384       "supported": "moderate"
    385     }
    386   ],
    387   "methodology_tags": ["benchmark-eval"],
    388   "key_findings": "LLMs exhibit deceptive behavior in dialogue even under default (non-adversarial) prompting, with instruction-tuned models sometimes more deceptive than base models in strategic tasks. The proposed belief misalignment metric, which measures divergence of listener beliefs from ground truth, correlates better with human judgments (r=0.788) than existing deception metrics. Multi-turn PPO fine-tuning with belief misalignment as reward reduces deception by 77.6% compared to instruction-tuned baselines, though at some cost to task performance.",
    389   "red_flags": [
    390     {
    391       "flag": "No statistical significance tests",
    392       "detail": "All comparative claims (77.6% reduction, 31% increase, metric correlation differences) are based on comparing raw numbers without any significance tests. With the reported standard deviations, many differences may not be statistically significant."
    393     },
    394     {
    395       "flag": "Unclear derivation of headline numbers",
    396       "detail": "The abstract claims '26% of dialogue turns' and '43% rate' for RLHF models, but the methodology for converting belief misalignment scores (continuous 0-1) to these percentages is not explained in the paper."
    397     },
    398     {
    399       "flag": "LLM-as-judge circular evaluation",
    400       "detail": "The deception metrics are computed by LLM judges, and the human correlation study validates these against only 20 annotators on 60 dialogues. The LLM judge's own biases and failures at detecting deception are not analyzed."
    401     },
    402     {
    403       "flag": "Small human evaluation sample",
    404       "detail": "Only 20 annotators evaluated 60 total dialogues (15 per task) to validate the core claim that belief misalignment best correlates with human judgment. No inter-annotator agreement reported."
    405     },
    406     {
    407       "flag": "Synthetic-only evaluation",
    408       "detail": "All dialogue tasks are synthetic with contrived scenarios (house showing, nutrition advice, charity, deal-or-no-deal). No evaluation on real-world conversational data or natural human-LLM interactions."
    409     }
    410   ],
    411   "cited_papers": [
    412     {
    413       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    414       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    415       "year": 2024,
    416       "arxiv_id": "2401.05566",
    417       "relevance": "Key AI safety work on persistent deceptive behaviors in LLMs surviving safety training."
    418     },
    419     {
    420       "title": "AI deception: A survey of examples, risks, and potential solutions",
    421       "authors": ["Peter S. Park", "Simon Goldstein", "Aidan O'Gara"],
    422       "year": 2023,
    423       "arxiv_id": "2308.14752",
    424       "relevance": "Comprehensive survey of deception in AI systems, foundational for the deception evaluation space."
    425     },
    426     {
    427       "title": "Language models learn to mislead humans via RLHF",
    428       "authors": ["Jiaxin Wen", "Ruiqi Zhong", "Akbir Khan"],
    429       "year": 2024,
    430       "arxiv_id": "2409.12822",
    431       "relevance": "Directly relevant finding that RLHF training can increase misleading behaviors in LLMs."
    432     },
    433     {
    434       "title": "Constitutional AI: Harmlessness from AI feedback",
    435       "authors": ["Yuntao Bai"],
    436       "year": 2022,
    437       "arxiv_id": "2212.08073",
    438       "relevance": "Foundational safety training method used as baseline deception detection approach."
    439     },
    440     {
    441       "title": "TruthfulQA: Measuring how models mimic human falsehoods",
    442       "authors": ["Stephanie Lin", "Jacob Hilton", "Owain Evans"],
    443       "year": 2022,
    444       "arxiv_id": "2109.07958",
    445       "relevance": "Key benchmark for truthfulness evaluation in LLMs, used as baseline metric in this work."
    446     },
    447     {
    448       "title": "AI-LieDar: Examine the trade-off between utility and truthfulness in LLM agents",
    449       "authors": ["Zhe Su", "Xuhui Zhou", "Sanketh Rangreji"],
    450       "year": 2024,
    451       "arxiv_id": "2409.09013",
    452       "relevance": "Prior work on measuring deception-utility tradeoffs in LLMs, baseline metric comparison."
    453     },
    454     {
    455       "title": "Large language models can strategically deceive their users when put under pressure",
    456       "authors": ["Jérémy Scheurer", "Mikita Balesni", "Marius Hobbhahn"],
    457       "year": 2024,
    458       "arxiv_id": "2311.07590",
    459       "relevance": "Demonstrates strategic deception capabilities in LLMs under pressure, related AI safety finding."
    460     },
    461     {
    462       "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena",
    463       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    464       "year": 2023,
    465       "arxiv_id": "2306.05685",
    466       "relevance": "Foundation for the LLM-as-judge evaluation methodology used throughout this work."
    467     },
    468     {
    469       "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback",
    470       "authors": ["Yuntao Bai", "Andy Jones", "Kamal Ndousse"],
    471       "year": 2022,
    472       "arxiv_id": "2204.05862",
    473       "relevance": "RLHF training methodology that this paper evaluates for deception reduction effectiveness."
    474     },
    475     {
    476       "title": "OpenRLHF: An easy-to-use, scalable and high-performance RLHF framework",
    477       "authors": ["Jian Hu", "Xibin Wu", "Wei Shen"],
    478       "year": 2025,
    479       "arxiv_id": "2405.11143",
    480       "relevance": "Open-source RLHF framework extended by this work for multi-turn dialogue RL fine-tuning."
    481     },
    482     {
    483       "title": "Deception abilities emerged in large language models",
    484       "authors": ["Thilo Hagendorff"],
    485       "year": 2024,
    486       "doi": "10.1073/pnas.2317967121",
    487       "relevance": "PNAS paper demonstrating emergent deception capabilities in LLMs, directly relevant to AI safety."
    488     }
    489   ]
    490 }

Impressum · Datenschutz