scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (18813B)
      1 {
      2   "paper": {
      3     "title": "Among Us: A Sandbox for Measuring and Detecting Agentic Deception",
      4     "authors": ["Satvik Golechha", "Adrià Garriga-Alonso"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2506.05316"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper states the sandbox, game logs, and probes are open-sourced. A GitHub link is provided (footnote 1: github.com/satvikgolechha/among-us-ai)."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper states game logs and probes are open-sourced along with the codebase."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is mentioned in the paper. Only individual libraries (PyTorch, HuggingFace) are referenced without versions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The reader is directed to the codebase but no README or reproduction guide is described."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "90% confidence intervals are computed through bootstrap resampling with 1000 iterations for the Elo ratings (Section 3.2, Figure 2)."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper makes comparative claims (e.g., reasoning models perform significantly better at deception) but does not use formal significance tests. Comparisons rely on Elo rating differences and visual inspection of confidence intervals."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper reports AUROC values and Elo differences but does not report standardized effect sizes (e.g., Cohen's d). Elo differences are shown but without baseline context for magnitude interpretation."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "2054 games are run, but no justification is given for why this number was chosen. No power analysis is discussed."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Bootstrap confidence intervals provide variance information for Elo ratings. The paper also discusses variability from random initialization and LLM stochasticity."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "18 models are compared against each other via Elo ratings. For probes, multiple training datasets (TQA, DQA, RepEng, Among Us) serve as baselines for comparison."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Models evaluated include recent releases: DeepSeek R1, Claude 3.7 Sonnet, GPT-4o, Llama 3.3, Phi-4. These are contemporary as of the paper's date."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper ablates probe performance by varying input components (action only, +thinking, +user prompt, +system prompt) in Figure 6. It also varies training data size in Appendix C."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics are used: Elo ratings, win rates, AUROC for probes, and separate lying/deception labels."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The paper uses GPT-4o-mini for LLM-based evaluation of agent actions (Appendix D) but does not include human evaluation of the system's outputs."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 4.1 Step 1: 'Split the dataset into training (80%) and a held-out test (20%) set.' Results are reported on the held-out test set."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Per-model Elo ratings are provided (Figure 2), per-dataset probe performance (Figure 7), and per-layer analysis (Figure 9/Appendix H)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses probe failures: probes fire on harmless negative words like 'suspicion' (Appendix G), SAE steering fails to reduce deception (Section 4.3, Appendix F), and TruthfulQA probes don't generalize (Appendix A)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "SAE steering is explicitly reported as unsuccessful: 'we find two SAE features that work well at deception detection but are unable to steer the model to lie less' (Abstract). Appendix F shows the model hallucinating under steering."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims about RL models being better at deception than detection (supported by Figure 5), probes achieving >95% AUROC OOD (supported by Figure 7), and SAE features failing at steering (supported by Appendix F) are all backed by results."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper implies RL training causes improved deception ('models trained with RL are comparatively much better at producing deception'). This evidence is correlational, drawn from an observational comparison of RL vs non-RL models, not a controlled experiment isolating RL as the causal factor."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper title and abstract suggest general findings about 'agentic deception' but results are from a single simplified text-based Among Us game. The paper does not explicitly bound generalization to this specific game setting."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for why RL models are better at deception (e.g., model size confound, training data differences). No threats-to-validity or alternative explanations section is present."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models are listed by marketing names (e.g., 'Claude 3.7 Sonnet', 'GPT-4o', 'DeepSeek R1') without API versions, snapshot dates, or specific checkpoints."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full system and user prompts are provided in Appendix B with concrete examples. The LLM evaluation prompt is also shared in Appendix D."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 4.1 reports probe training hyperparameters: weight decay 10^-3, 4 epochs, batch size 32, Adam optimizer, learning rate 0.001 with StepLR scheduling. Elo K=32 is stated."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The agent scaffolding is described in detail: scratchpad for thinking, condensed memory, structured output format with [Condensed Memory], [Thinking Process], [Action] sections. The game engine and action space are described."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.1 documents the data pipeline: activation caching via PyTorch hooks on layer 20, last 10 tokens, normalization using train mean/variance. Dataset construction from game logs is described."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 5 is titled 'Limitations' and discusses several specific limitations of the work."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The limitations section discusses specific threats: the game is simplified compared to real Among Us, probes are tested on only one model (Phi-4), and SAE steering doesn't work."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The limitations section discusses what the work doesn't achieve but does not explicitly state what the results do NOT show or what claims the authors are NOT making about generalization to real-world deception."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The paper states game logs are open-sourced alongside the codebase, enabling independent verification of the results."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3 describes data collection: 2054 games with 7 players randomly initialized from 18 models via OpenRouter API. Game parameters (50 timesteps, 7 players, 2 impostors) are specified."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. All players are LLM agents."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from game simulation to Elo computation and from activation caching to probe training is described step by step in Sections 3.2 and 4.1."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding acknowledgment section is present. Authors are affiliated with MATS and FAR AI but no funding sources are disclosed."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Satvik Golechha (MATS), Adrià Garriga-Alonso (FAR AI)."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence cannot be assessed. FAR AI (Frontier AI Research) has a mission related to AI safety which could create alignment between organizational goals and findings about deception risks."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper evaluates LLM strategic behavior in a novel game environment, not model knowledge on a benchmark. Contamination of game strategies is not a concern since each game is dynamically generated."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not applicable — the evaluation is on dynamically generated game instances, not a fixed benchmark that could appear in training data."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not applicable — Among Us games are generated at runtime, not a pre-existing benchmark."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in the study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in the study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in the study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in the study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Appendix I provides detailed cost breakdowns: $2000 for 2054 games via OpenRouter, $100 for Llama/Phi games, $2 for GPT-4o-mini evaluation, $400 for GPU compute, $30 for Goodfire API."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Appendix I: 2x 80GB A100 GPUs for 100 hours at $2/hour each. Total API spend across experiments is itemized."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Models trained with RL are comparatively much better at producing deception than detecting it.",
    286       "evidence": "Figure 5 shows RL-trained models (triangles) cluster above the diagonal line, indicating higher Deception Elo relative to Detection Elo. Section 3.3 and Figure 2 show reasoning models have significantly higher Deception Elo.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "Linear probes trained on 'pretend you're dishonest' data generalize to detect deception in Among Us with >95% AUROC.",
    291       "evidence": "Figure 7 shows RepEng-trained probes achieve 0.94 AUROC for lying and 0.99 for deception on Among Us data. DQA probes also show high cross-dataset transfer.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "SAE features can detect deception but cannot steer models to be less deceptive.",
    296       "evidence": "Table 1 shows SAE features achieving 0.967 AUROC for detection. Appendix F shows steering attempts cause hallucination rather than reduced deception.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Among Us is a durable benchmark that will not saturate quickly because it is a multi-player game far from equilibrium.",
    301       "evidence": "Section 2 argues this theoretically based on game-theoretic properties. No empirical evidence of durability is provided — the paper tests at a single time point.",
    302       "supported": "weak"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "The paper introduces an Among Us sandbox for evaluating LLM deception capabilities, finding that RL-trained reasoning models are disproportionately better at deception than detection across 18 models and 2054 games. Linear probes trained on simple honesty/dishonesty contrasting datasets generalize remarkably well to detect in-game deception (>95% AUROC), even when evaluated only on the deceptive statement without chain-of-thought. SAE features can detect deception but fail to steer models toward honesty.",
    307   "red_flags": [
    308     {
    309       "flag": "Causal claim from correlational evidence",
    310       "detail": "The claim that RL training specifically causes improved deception capability is based on observational comparison of RL vs non-RL models. Model size, training data, and architecture confounds are not controlled for."
    311     },
    312     {
    313       "flag": "Single model for probe experiments",
    314       "detail": "All activation probe experiments use only Phi-4. Generalization of probe findings to other model architectures is not tested."
    315     },
    316     {
    317       "flag": "No formal significance tests",
    318       "detail": "Claims about models being 'significantly better' at deception rely on visual comparison of Elo ratings with bootstrap CIs rather than formal statistical tests."
    319     }
    320   ],
    321   "cited_papers": [
    322     {
    323       "title": "Alignment Faking in Large Language Models",
    324       "authors": ["Evan Hubinger"],
    325       "year": 2024,
    326       "relevance": "Directly relevant to AI deception and alignment — studies whether LLMs fake alignment during training."
    327     },
    328     {
    329       "title": "Representation Engineering: A Top-Down Approach to AI Transparency",
    330       "authors": ["Dan Hendrycks"],
    331       "year": 2024,
    332       "relevance": "Foundational method for training honesty/dishonesty probes used in this paper's experiments."
    333     },
    334     {
    335       "title": "Frontier AI Regulation: Managing Emerging Risks to Public Safety",
    336       "authors": ["Markus Anderljung"],
    337       "year": 2023,
    338       "relevance": "Discusses regulatory frameworks for frontier AI risks including deception capabilities."
    339     },
    340     {
    341       "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
    342       "authors": ["Evan Hubinger"],
    343       "year": 2024,
    344       "relevance": "Studies persistent deceptive behavior in LLMs that survives safety training, directly relevant to AI safety evaluation."
    345     },
    346     {
    347       "title": "The AI Control Problem",
    348       "authors": ["Buck Shlegeris"],
    349       "year": 2025,
    350       "relevance": "Frames the problem of controlling AI agents that may be deceptive, which this paper's sandbox helps evaluate."
    351     },
    352     {
    353       "title": "TruthfulQA: Measuring How Models Mimic Human Falsehoods",
    354       "authors": ["Stephanie Lin"],
    355       "year": 2022,
    356       "relevance": "Benchmark dataset used in this paper's probe training and evaluation experiments."
    357     },
    358     {
    359       "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference",
    360       "authors": ["Wei-Lin Chiang"],
    361       "year": 2024,
    362       "relevance": "Introduces Elo rating methodology for LLM evaluation that this paper adapts for deception measurement."
    363     },
    364     {
    365       "title": "Language Models Don't Always Say What They Think: Unfaithful Explanations in Chain-of-Thought Prompting",
    366       "authors": ["Miles Turpin"],
    367       "year": 2024,
    368       "relevance": "Studies faithfulness of LLM reasoning chains, relevant to detecting deception through chain-of-thought."
    369     }
    370   ]
    371 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs