scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32969B)
      1 {
      2   "paper": {
      3     "title": "The Traitors: Deception and Trust in Multi-Agent Language Model Simulations",
      4     "authors": ["Pedro M. P. Curvo"],
      5     "year": 2025,
      6     "venue": "NeurIPS 2025 Workshop: Multi-Turn Interactions in Large Language Models",
      7     "arxiv_id": "2505.12923",
      8     "doi": "10.48550/arXiv.2505.12923"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval", "case-study"],
     13   "key_findings": "The paper introduces 'The Traitors,' a multi-agent simulation framework for studying deception and trust dynamics among LLM agents in a social deduction game setting. Initial experiments with DeepSeek-V3, GPT-4o-mini, and GPT-4o (10 runs each) reveal an asymmetry: GPT-4o achieves 93% traitor survival rate but only 10% faithful correctness rate, suggesting deception capabilities may scale faster than detection abilities. All models achieve perfect traitor agreement scores but vary substantially in faithful coordination.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "GitHub repository provided at https://github.com/pedrocurvo/TheTraitors (footnote 1, page 1). Also stated in Section 5: 'By releasing the full environment, we aim to enable broader experimentation.'"
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No simulation data (transcripts, voting logs, agent memory states) is released despite comprehensive logging being mentioned in Section 1.2. Only aggregated results in Table 1 are provided."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper states experiments were run 'via public LLM APIs' (Section 3.1) but provides no requirements.txt, Dockerfile, or library version specifications for the simulation framework."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. While the code is released, the paper does not include a reproduction guide, specific commands, or configuration files needed to replicate the reported results."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Table 1 reports all metrics with ± standard deviation (e.g., 'TSR: 0.93 ± 0.13' for GPT-4o). Standard deviations are computed across 10 independent runs."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The author explicitly acknowledges that 'our sample size precludes definitive statistical significance testing' (Section 3.2). No p-values, t-tests, or other significance tests are reported, even though the paper makes comparative claims between models."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Raw metric values are reported but no formal effect sizes (Cohen's d, odds ratios, etc.) are computed. Differences between models are discussed qualitatively without effect size quantification."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The choice of 10 runs per model is attributed to 'computational resource constraints' (Section 5), but no power analysis or formal justification is given for why 10 runs is adequate for the claims made."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Standard deviations across 10 runs are reported for all metrics in Table 1 (e.g., 'FAS: 0.83 ± 0.09' for DeepSeek-V3)."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Three models are compared against each other: DeepSeek-V3 (open-weight), GPT-4o-mini, and GPT-4o (closed-weight). Each serves as a baseline for the others (Table 1)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "All three models (DeepSeek-V3, GPT-4o-mini, GPT-4o) are contemporary state-of-the-art models at time of publication."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No ablation study is performed. The system has multiple components (persistent memory, role-specific prompts, discussion phases, voting mechanisms) but none are individually tested for contribution to the results."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Ten distinct metrics are used across three categories: coordination (TAS, FAS), effectiveness (FCR, TSR, FSR, DES), and behavioral (IDR, BRR, VSF, TNS), defined formally in Appendix B."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation of the simulation outputs (e.g., human rating of deception quality, persuasiveness of agent utterances, or naturalness of dialogue). All evaluation is automated via the defined metrics."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "This is a simulation study with fixed prompts and no tuning process. There is no training/development/test split concept applicable here — the prompts are designed a priori and all 10 runs per model are evaluated identically."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 1 provides per-metric breakdowns across three categories (Coordination, Effectiveness, Behavioral) and three models, rather than a single aggregate score."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 3.2 discusses failure modes: GPT-4o faithful agents' very low detection accuracy (FCR: 0.10), consistently low Betrayal Recognition Rates across all models (0.10-0.16), and DeepSeek-V3's trust network volatility. Appendix D.4.1 shows a qualitative example of coordinated traitor deception succeeding."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports the counterintuitive negative result that GPT-4o, despite being the most capable model, performs worst at faithful detection (FCR: 0.10 vs ~0.55 for others). The asymmetry between deception and detection capability is itself a negative finding for AI safety."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract's main claims — that GPT-4o shows 'superior deceptive capabilities yet exhibits disproportionate vulnerability to others' falsehoods' and that 'deception skills may scale faster than detection abilities' — are supported by Table 1 data (GPT-4o TSR: 0.93 vs FCR: 0.10). The hedging language ('suggests,' 'may') is appropriate given the limited experiments."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes causal-sounding claims: 'This asymmetry likely stems from differences in language generation capabilities that affect persuasive communication' (Section 3.2). This is a causal claim from observational comparison of 3 models without controlling for confounds like prompt sensitivity, training data differences, or RLHF alignment effects."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper draws broad conclusions about 'deception capabilities scaling faster than detection abilities' and implications for 'AI safety' from tests of only 3 models in a single game configuration with homogeneous agent populations. Section 6 states that the findings raise 'important questions about the future trajectory of strategic social reasoning in AI systems' without adequately bounding that statement to the specific tested setting."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not consider alternative explanations for GPT-4o's high TSR / low FCR. Possible alternatives include: GPT-4o's RLHF training making it more compliant with deceptive role instructions, prompt sensitivity differences, model-specific tendencies to agree or dissent, or the specific prompt design favoring certain model architectures."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper measures game-specific metrics (voting patterns, survival rates) but frames them as measuring general 'deception capability,' 'detection ability,' and 'trust dynamics' with implications for AI safety. The gap between game performance in a specific social deduction setup and general deceptive capability is not acknowledged."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Models are referred to only as 'DeepSeek-V3,' 'GPT-4o-mini,' and 'GPT-4o' without API snapshot dates or version identifiers. The footnote acknowledges 'API models may introduce variability' but does not provide specific versions used."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full prompt text for both traitor and faithful agents is provided in Appendix D.1. Discussion prompts, traitor meeting prompts, voting prompts, and post-elimination reflection prompts are also shown in Appendix D.3."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 3.1 reports 'stochastic decoding with a temperature of T = 0.7 and nucleus sampling with top_p = 0.9.' The game configuration parameters (10 agents, 3 traitors, δ = 1) are also specified."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The agent memory architecture is described in detail in Section 2.2.5 (categorized memory entries, belief tracking, chronological event history, strategic considerations). The game flow (night/day/voting phases) and agent observation/policy functions are formally defined. Appendix D.2 details the memory structure."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "While metrics are formally defined in Appendix B, the actual pipeline from raw simulation logs (transcripts, votes, memory states) to the aggregated statistics in Table 1 is not documented. No description of how raw outputs were processed, whether any runs were excluded, or how metric computation was implemented."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5 'Limitations' is a dedicated section covering resource constraints, evaluation challenges, and implementation constraints. Additional future directions are discussed in Appendix E."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 5 discusses specific threats: limited runs (10 per model) due to API costs, inability to distinguish 'strategic misdirection from reasoning failures,' reliance on 'off-the-shelf LLMs with no cross-game learning,' homogeneous agent populations, and behavior shaped primarily by prompt engineering."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly states 'the presented findings should be regarded as preliminary rather than definitive' (Section 5) and acknowledges 'the experiments presented here serve primarily as proof-of-concept demonstrations of the framework's capabilities, rather than as comprehensive behavioral studies' (Section 1)."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw simulation data is released. Complete dialogue transcripts, voting records, agent memory states, and elimination patterns are not made available despite being described as comprehensively logged (Section 1.2)."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 3.1 describes the simulation setup: 10 agents (3 traitors, 7 faithful), 10 independent trials per model with different random seeds, sequential dialogue, text-based communication, comprehensive logging of elimination patterns, dialogue transcripts, voting behavior, and survival statistics."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants are involved. All agents are LLM instances. Recruitment methods do not apply."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The pipeline from raw simulation outputs to the reported metrics is not documented. While individual metrics are formally defined (Appendix B), the actual computation process, any data cleaning, and aggregation steps are not described."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source or acknowledgments section is present in the paper. The work is by a student at the University of Amsterdam, but there is no statement about whether the work was funded or unfunded."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliation (University of Amsterdam, Science Park 904) is clearly stated on the first page. The author is not affiliated with any of the model providers being evaluated."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": false,
    217         "answer": false,
    218         "justification": "No funding is disclosed. This appears to be unfunded student research at a university — the author is not affiliated with any evaluated model provider."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates are stated for any of the three models (DeepSeek-V3, GPT-4o-mini, GPT-4o). This is relevant because the models may have been trained on extensive descriptions of social deduction game strategies (Mafia, Werewolf, Avalon) that could influence their behavior."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether models have been exposed to social deduction game strategies, Mafia/Werewolf gameplay transcripts, or similar game-theoretic reasoning in their training data. This exposure could significantly bias model behavior in the game setting."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "Not addressed. While the specific game transcripts are dynamically generated, the strategic patterns the paper claims to study (deception strategies, trust building, coalition formation) may be heavily influenced by training data containing similar game scenarios."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants. All agents are LLM instances."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. All agents are LLM instances."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants. All agents are LLM instances."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants. All agents are LLM instances."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants. All agents are LLM instances."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants. All agents are LLM instances."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants. All agents are LLM instances."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Section 3.1 reports 'Each simulation takes approximately 10 minutes to complete and consumes an average of 500,000 tokens.' This provides both time and token cost per simulation run."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Per-simulation costs are given (10 min, 500K tokens) but total computational budget across all 30 simulations is not stated. No total API spend or aggregate compute cost is reported."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Results are reported across 10 independent simulations with 'different random seeds to account for stochastic variation' (Section 3.1). Mean and standard deviation across seeds are provided for all metrics in Table 1."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Explicitly stated: '10 independent trials' per model (Section 3.1). 'All results represent averages across 10 independent simulations, reported as mean ± standard deviation' (Section 3.2)."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search is described. The game configuration (10 agents, 3 traitors) and decoding parameters (T=0.7, top_p=0.9) appear chosen without systematic search. No search budget or method is reported."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "A single predefined configuration is used (10 agents, 3 traitors, full role revelation). All results from this configuration are reported without selective reporting. The configuration choice is explained as the 'baseline configuration' (Section 3.1)."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Three models are compared on 10 metrics with no multiple comparison correction. No statistical tests are performed at all, yet comparative conclusions are drawn from the metric differences."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The author designed the framework, the prompts, and the metrics, then evaluated models on that self-designed system. The potential bias of evaluating models in an environment designed by the author is not acknowledged or discussed."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The three models (DeepSeek-V3, GPT-4o-mini, GPT-4o) have very different computational costs and capabilities, but performance is not reported as a function of compute budget. Cost differences between API calls to different models are not discussed."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper does not discuss whether game metrics (voting-based TAS, FCR, TSR etc.) actually measure the constructs claimed (general 'deception capability,' 'trust dynamics'). The gap between game-specific performance and the broader AI safety implications drawn from it is not examined."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "All models use identical scaffolding: the same memory architecture, game management code, prompt templates, and interaction structure. The scaffold is held constant across model comparisons, isolating the model as the variable."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "Not discussed. The models may have been trained on descriptions of social deduction game strategies, optimal Mafia/Werewolf play, and game theory literature that directly informs the behaviors being evaluated."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "Not discussed. Traitor agents receive complete role information by design (part of the game), but the broader question of whether training data exposure to game strategies constitutes feature leakage for behavioral evaluation is not addressed."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Not discussed. All 10 runs per model use the same prompts, same game configuration, and same model — these runs share substantial structural similarity that could inflate apparent consistency of results."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is applied. No canary strings, temporal splits, or decontamination pipelines are used."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Deception capabilities may scale faster than detection abilities in LLMs.",
    365       "evidence": "Table 1: GPT-4o (most capable model) achieves TSR 0.93 ± 0.13 but only FCR 0.10 ± 0.09, while less capable models (DeepSeek-V3, GPT-4o-mini) show TSR 0.33 ± 0.37 with FCR ~0.55. Section 3.2 discusses this asymmetry.",
    366       "supported": "weak"
    367     },
    368     {
    369       "claim": "All models demonstrate perfect traitor coordination (TAS: 1.00) across all runs.",
    370       "evidence": "Table 1: TAS = 1.00 ± 0.00 for all three models across 10 runs each.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "GPT-4o demonstrates superior deceptive capabilities yet exhibits disproportionate vulnerability to others' falsehoods.",
    375       "evidence": "Table 1: GPT-4o TSR 0.93 vs 0.33 for other models; GPT-4o FCR 0.10 vs ~0.55 for other models. Section 3.2 interprets this as 'GPT-4o may generate more persuasive deceptive communication that successfully misleads even other GPT-4o agents.'",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Successful traitor identification emerges through group consensus rather than individual insight.",
    380       "evidence": "Table 1: BRR consistently low across all models (0.10-0.16). Section 3.2: 'Betrayal Recognition Rate remained consistently low... indicating that successful traitor identification typically emerges through group consensus.'",
    381       "supported": "weak"
    382     },
    383     {
    384       "claim": "More capable closed-weight models may generate more diverse or independent voting patterns when operating as faithful agents.",
    385       "evidence": "Table 1: GPT-4o FAS 0.58 ± 0.09 vs DeepSeek-V3 FAS 0.83 ± 0.09. Section 3.2 discusses this as potentially reflecting 'more diverse or independent voting patterns.'",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "Deception Effectiveness Score is perfect (1.00) for all models, indicating traitors always successfully orchestrate faithful eliminations.",
    390       "evidence": "Table 1: DES = 1.00 ± 0.00 for all three models.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "Tiny sample size for breadth of claims",
    397       "detail": "Only 10 runs per model with 3 models tested. The author acknowledges this precludes significance testing, yet still draws conclusions about general scaling of deception vs detection capabilities and AI safety implications."
    398     },
    399     {
    400       "flag": "No significance tests despite comparative claims",
    401       "detail": "Claims like 'GPT-4o demonstrates superior deceptive capabilities' and 'deception capabilities may scale faster than detection abilities' are made without any statistical testing. Given the high variance (e.g., TSR 0.33 ± 0.37 for DeepSeek-V3 and GPT-4o-mini), these differences may not be statistically significant."
    402     },
    403     {
    404       "flag": "Suspicious metric uniformity",
    405       "detail": "DES = 1.00 ± 0.00 and TAS = 1.00 ± 0.00 for ALL models suggests these metrics may not be discriminating or may reflect game design artifacts rather than meaningful model differences. A metric with zero variance across all conditions provides no diagnostic value."
    406     },
    407     {
    408       "flag": "Homogeneous populations only",
    409       "detail": "All experiments use homogeneous agent populations (same model for all agents). The framework claims to support heterogeneous configurations but none are tested. This means results reflect model-self-interaction dynamics, which may differ substantially from mixed-model scenarios."
    410     },
    411     {
    412       "flag": "Claims significantly outrun evidence",
    413       "detail": "The paper draws broad AI safety conclusions about deceptive alignment, alignment challenges, and the 'future trajectory of strategic social reasoning in AI systems' from a proof-of-concept study with 3 models in a single game configuration. The gap between evidence and implications is very large."
    414     },
    415     {
    416       "flag": "Model versions not specified",
    417       "detail": "API models (GPT-4o, GPT-4o-mini) change behavior across versions. Without snapshot dates, the results are not reproducible and may already be outdated."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "AvalonBench: Evaluating LLMs playing the game of Avalon",
    423       "authors": ["J. Light", "M. Cai", "S. Shen", "Z. Hu"],
    424       "year": 2023,
    425       "arxiv_id": "2310.05036",
    426       "relevance": "Directly comparable framework for evaluating LLMs in social deduction games (Avalon), establishing baselines for multi-agent game evaluation."
    427     },
    428     {
    429       "title": "Exploring large language models for communication games: An empirical study on werewolf",
    430       "authors": ["Y. Xu", "S. Wang", "P. Li", "F. Luo", "X. Wang", "W. Liu", "Y. Liu"],
    431       "year": 2024,
    432       "arxiv_id": "2309.04658",
    433       "relevance": "Studies LLM behavior in Werewolf, a closely related social deduction game, providing comparison points for deception and detection capabilities."
    434     },
    435     {
    436       "title": "Avalon's game of thoughts: Battle against deception through recursive contemplation",
    437       "authors": ["S. Wang", "C. Liu", "Z. Zheng", "S. Qi", "S. Chen", "Q. Yang", "A. Zhao", "C. Wang", "S. Song", "G. Huang"],
    438       "year": 2023,
    439       "arxiv_id": "2310.01320",
    440       "relevance": "Explores recursive reasoning and deception in LLM agents playing Avalon, relevant to understanding deception mechanisms in social games."
    441     },
    442     {
    443       "title": "Among them: A game-based framework for assessing persuasion capabilities of LLMs",
    444       "authors": ["M. Idziejczak", "V. Korzavatykh", "M. Stawicki", "A. Chmutov", "M. Korcz", "I. Błądek", "D. Brzeziński"],
    445       "year": 2025,
    446       "arxiv_id": "2502.20426",
    447       "relevance": "Closely related framework using Among Us-style game to assess LLM persuasion capabilities, combining task completion with social deduction."
    448     },
    449     {
    450       "title": "Cooperate or collapse: Emergence of sustainable cooperation in a society of LLM agents",
    451       "authors": ["G. Piatti", "Z. Jin", "M. Kleiman-Weiner", "B. Schölkopf", "M. Sachan", "R. Mihalcea"],
    452       "year": 2024,
    453       "arxiv_id": "2404.16698",
    454       "relevance": "Studies cooperative and competitive dynamics among LLM agents in multi-agent simulations, complementary to the adversarial dynamics studied here."
    455     },
    456     {
    457       "title": "TruthfulQA: Measuring how models mimic human falsehoods",
    458       "authors": ["S. Lin", "J. Hilton", "O. Evans"],
    459       "year": 2022,
    460       "arxiv_id": "2109.07958",
    461       "relevance": "Benchmark for measuring LLM truthfulness, relevant to the broader question of when and why LLMs produce deceptive or false outputs."
    462     },
    463     {
    464       "title": "Risks from learned optimization in advanced machine learning systems",
    465       "authors": ["E. Hubinger", "C. van Merwijk", "V. Mikulik", "J. Skalse", "S. Garrabrant"],
    466       "year": 2019,
    467       "arxiv_id": "1906.01820",
    468       "relevance": "Foundational AI safety paper on deceptive alignment risk — the scenario where AI systems appear aligned during training but harbor hidden objectives."
    469     },
    470     {
    471       "title": "Concrete problems in AI safety",
    472       "authors": ["D. Amodei", "C. Olah", "J. Steinhardt", "P. Christiano", "J. Schulman", "D. Mané"],
    473       "year": 2016,
    474       "arxiv_id": "1606.06565",
    475       "relevance": "Seminal AI safety paper identifying core safety challenges including reward hacking and distributional shift, contextualizing the deception concerns studied here."
    476     },
    477     {
    478       "title": "Human-level play in the game of diplomacy by combining language models with strategic reasoning",
    479       "authors": ["A. Bakhtin", "N. Brown", "E. Dinan", "G. Farina", "C. Flaherty", "D. Fried"],
    480       "year": 2022,
    481       "relevance": "Demonstrates that language models combined with strategic reasoning can achieve human-level performance in Diplomacy, including strategic deception through natural language negotiation."
    482     },
    483     {
    484       "title": "OpenToM: A comprehensive benchmark for evaluating theory-of-mind reasoning capabilities of large language models",
    485       "authors": ["H. Xu", "R. Zhao", "L. Zhu", "J. Du", "Y. He"],
    486       "year": 2024,
    487       "arxiv_id": "2402.06044",
    488       "relevance": "Benchmarks LLM theory-of-mind capabilities, directly relevant to the social reasoning and belief modeling required for deception and detection in The Traitors."
    489     },
    490     {
    491       "title": "Persuasion games using large language models",
    492       "authors": ["G. P. Ramani", "S. Karande", "S. V", "Y. Bhatia"],
    493       "year": 2024,
    494       "arxiv_id": "2408.15879",
    495       "relevance": "Studies LLM persuasion capabilities in debate formats, related to the persuasive deception dynamics in The Traitors game."
    496     },
    497     {
    498       "title": "AI safety via debate",
    499       "authors": ["G. Irving", "P. Christiano", "D. Amodei"],
    500       "year": 2018,
    501       "arxiv_id": "1805.00899",
    502       "relevance": "Proposes adversarial discourse as an alignment mechanism, directly related to whether faithful agents can collectively overcome traitor deception through structured dialogue."
    503     }
    504   ],
    505   "engagement_factors": {
    506     "practical_relevance": {
    507       "score": 1,
    508       "justification": "Framework for AI safety researchers, not directly applicable to practitioner workflows; requires setting up multi-agent simulations via API."
    509     },
    510     "surprise_contrarian": {
    511       "score": 2,
    512       "justification": "The finding that more capable models are better at deceiving but worse at detecting deception is counterintuitive and challenges assumptions about general capability scaling."
    513     },
    514     "fear_safety": {
    515       "score": 2,
    516       "justification": "Raises AI safety concerns by demonstrating emergent deceptive behavior in LLMs and suggesting deception scales faster than detection, relevant to alignment discourse."
    517     },
    518     "drama_conflict": {
    519       "score": 1,
    520       "justification": "No major controversy or confrontational claims; positions itself as a research contribution rather than a critique of existing work or companies."
    521     },
    522     "demo_ability": {
    523       "score": 2,
    524       "justification": "Code released on GitHub (https://github.com/pedrocurvo/TheTraitors); researchers could run their own simulations with API access."
    525     },
    526     "brand_recognition": {
    527       "score": 1,
    528       "justification": "Solo student project at the University of Amsterdam; evaluates GPT-4o, which is well known, but the paper itself is not from a famous lab."
    529     }
    530   }
    531 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs