ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (26145B)


      1 {
      2   "paper": {
      3     "title": "D-REX: A Benchmark for Detecting Deceptive Reasoning in Large Language Models",
      4     "authors": ["Satyapriya Krishna", "Eliot Krzysztof Jones", "J. Zico Kolter", "Andy Zou", "Nick Winter", "Matt Fredrikson", "Rahul Gupta", "Dan Hendrycks", "Spyros Matsoukas"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2509.17938",
      8     "doi": "10.48550/arXiv.2509.17938"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "D-REX introduces a benchmark for detecting deceptive reasoning in LLMs via competitive red-teaming, collecting 8,162 adversarial samples across 7 behaviors. All 7 frontier models tested showed 28-42% target-specific jailbreak success rates. Models scored above 8.4/10 on detection evasion and output camouflage, indicating strong ability to disguise malicious reasoning. Intra-model analysis showed CoT length does not causally predict jailbreak success, suggesting the benchmark resists simple gaming.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No code repository or archive is mentioned in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The benchmark is explicitly maintained as a private dataset (Section 3.2, Appendix D): 'D-REX is maintained as a private benchmark.' Only aggregate results are shared via a controlled submission process."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, dependency files, or hardware details are provided."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No reproduction instructions are provided. The private benchmark model precludes independent replication."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Table 2 reports ± standard error for all jailbreak rates (e.g., '28.22 ± 1.38'). Figure 3 includes error bars representing standard error of the mean."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper compares 7 models and claims some are 'most susceptible' or 'most resilient' but uses no statistical significance tests. Differences are assessed by comparing point estimates only."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Jailbreak rates are reported as percentages with context (e.g., Target-Specific 28.22% to 42.01%, Overall Vulnerability 0.20% to 42.05%), providing baseline context for effect magnitudes."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "8,162 samples were collected and 500 were used for the Overall Vulnerability metric, but no justification is given for either sample size, and no power analysis is discussed."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Standard errors are reported in Table 2 and error bars in Figure 3, showing variance across samples."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Seven frontier models are compared against each other, and D-REX is compared against 7 related benchmarks in Table 1."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "All evaluated models are 2024-2025 frontier models (Gemini 2.5, Claude 3.7, DeepSeek R1, Qwen 3, Grok 3). Benchmark comparisons include recent work (2024-2025)."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No ablation study of the benchmark design (e.g., effect of different behaviors, judge aggregation methods, number of judges) is performed."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Two jailbreak metrics (Target-Specific Success, Overall Vulnerability) and six deceptive behavior criteria (Detection Evasion, Output Camouflage, Harmful Content, Malicious Internal Reasoning, Secrecy, Deceptively Harmful Output) are used."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "All evaluation is performed by LLM judges (o4-Mini, Claude 3.7, Gemini 2.5 Pro, Nova Premier). No human evaluation of the system outputs or judge reliability is reported."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The private test set was not used for any model tuning or selection decisions; models are evaluated off-the-shelf on the full benchmark."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down per model across 6 criteria (Figure 3), by behavior type (7 behaviors described), and by two distinct evaluation metrics (Table 2)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 4.2 provides detailed qualitative analysis of specific failure modes: trigger-based malicious compliance (Table 3), instrumental reasoning and self-censorship, and subtle misinformation seeding."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "No negative results are reported. All models are shown to be vulnerable, all behaviors succeed, and no failed approaches or unsuccessful evaluation strategies are discussed."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims D-REX 'presents a significant challenge for existing models and safety mechanisms' — supported by Table 2 showing 28-42% success rates and Figure 3 showing high scores across deceptive criteria."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper's main causal claims (adversarial prompts cause deceptive reasoning) are supported by the experimental design: controlled insertion of adversarial prompts with before/after comparison (Table 3 shows triggered vs non-triggered behavior)."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title says 'Large Language Models' broadly, but only 7 specific models are tested. The abstract makes claims about 'existing models and safety mechanisms' without bounding them to the tested models. Appendix E partially addresses this for non-CoT models."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Appendix C investigates whether CoT length is a confound vs. true deceptive capability, using intra-model analysis (Figure 5) to rule out the alternative explanation that longer CoT simply correlates with higher jailbreak rates."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper uses LLM judge scores (0-10) as a proxy for 'deceptive reasoning' without discussing the gap between automated judge assessments and actual deceptive intent. No validation that LLM judges reliably detect deception is provided."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 4.1 lists specific model identifiers: 'nova-pro-v1', 'deepseek/deepseek-r1', 'anthropic/claude-3.7-sonnet:thinking', 'google/gemini-2.5-flash', 'google/gemini-2.5-pro', 'x-ai/grok-3-mini-beta', 'qwen/qwen3-235b-a22b'."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Appendix A provides the full judge prompt text for the Conspiracy behavior, including detailed scoring rubrics for each criterion. However, judge prompts for other behaviors and the red-teaming system prompts are not fully provided."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No temperature, top-p, max tokens, or other sampling parameters are reported for either the evaluated models or the judge LLMs."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. Models receive direct system prompt injections and user queries."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper states 8,162 samples were collected but does not describe any filtering, quality control, or preprocessing steps applied to the raw competition submissions."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No dedicated Limitations section exists. Appendix E ('Future Work') mentions some limitations (e.g., applicability limited to models with explicit CoT) but is framed as future directions, not limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No specific threats to validity are discussed. The future work appendix mentions CoT-model limitation and scope of failure modes but does not frame these as threats to the current study's validity."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "Appendix E briefly notes the evaluation is 'most directly applicable to models that produce an explicit CoT' but does not systematically state what the results do NOT show or what is excluded."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The dataset is private. Appendix D states that only 'aggregate jailbreak success rates and other relevant performance metrics' are shared, explicitly excluding model outputs and reasoning chains."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 3.2 describes the competitive red-teaming process: participants crafted adversarial system prompts via an interface (Figure 2), interacting with anonymized frontier models across 7 defined behaviors. The competition ran 'for a few weeks' yielding 8,162 samples."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "The paper describes a 'competitive red-teaming exercise' but provides no information about how participants were recruited, their qualifications, selection criteria, number of participants, or potential selection biases."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No documentation of how the 8,162 raw samples were processed, filtered, or validated before being included in the benchmark. It is unclear whether all submissions were included or quality-filtered."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding sources or acknowledgments section is present in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: Amazon Nova Responsible AI, Center for AI Safety, CMU, and Gray Swan AI."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Amazon employees (Krishna, Gupta, Matsoukas) are co-authors, and Amazon's Nova Pro model is evaluated — scoring best on resilience (0.20% overall vulnerability vs 7-42% for competitors). This conflict is not acknowledged."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial disclosure statement is present. Authors are affiliated with companies (Amazon, Gray Swan AI) that have financial interests in AI safety outcomes."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper tests model defenses against prompt injection, not model knowledge on a benchmark. Contamination of defense behavior is a different concern than training-data contamination."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "This is a red-teaming/defense study, not a capability benchmark. Train/test overlap in the traditional sense does not apply."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "The paper tests defenses rather than model knowledge. The private benchmark design prevents contamination of the evaluation itself, but this is a different concern than benchmark contamination in capability evaluation."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "Human red-teamers created the dataset but are not study subjects. The paper evaluates model behavior, not human behavior."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human subjects study is conducted. Red-teamers are data creators, not research participants being studied."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human subjects study. Red-teamer demographics are relevant to data quality but not as a human subjects concern."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human subjects study conducted."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human subjects experiment conducted."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human subjects experiment conducted."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human subjects study conducted."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference costs, API costs, or per-sample evaluation costs are reported despite running 7 models on 8,162 samples with 4 judge LLMs."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No computational budget, hardware specifications, or total API spend is reported."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No random seed analysis is reported. It is unclear whether model evaluations were run with fixed or varied seeds."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of evaluation runs per model is not stated. Standard errors in Table 2 appear to be computed across samples, not across runs."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search is reported. Temperature and sampling settings are not even stated, let alone searched."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "The evaluation uses a fixed configuration: minimum score across 4 judges, which is stated as establishing 'a more stringent and conservative metric.' No cherry-picking of configurations."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Seven models are compared across multiple criteria and metrics without any correction for multiple comparisons."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "Amazon employees evaluate Amazon's Nova Pro model alongside competitors. No acknowledgment of self-evaluation bias despite Nova Pro scoring best on resilience."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "All models are evaluated as API endpoints; compute differences are not a variable in this evaluation."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Appendix C investigates whether the benchmark can be gamed by CoT length, using intra-model quintile analysis (Figure 5) to show CoT verbosity does not causally predict jailbreak success. This validates the construct against a key concern."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is used; models receive direct prompts."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether models may have been trained on similar adversarial prompt patterns or red-teaming data. Keeping the benchmark private helps, but the paper does not address this temporal-leakage risk."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup leaks information (e.g., whether model names being anonymized during competition prevents information leakage about model-specific defenses)."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether red-teaming samples from the same participants or same behavior categories share structural similarities that could inflate performance estimates."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "The benchmark is maintained as private (Appendix D) specifically to prevent data contamination: 'preventing both benchmark gaming...and data contamination, where public test cases are absorbed into future training sets.' This is a concrete prevention method."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "All evaluated frontier models are significantly vulnerable to deceptive reasoning attacks, with target-specific jailbreak success rates ranging from 28.22% (Nova Pro) to 42.01% (Qwen 3 235B).",
    365       "evidence": "Table 2 reports target-specific success rates with standard errors for all 7 models.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "Models are highly adept at detection evasion and output camouflage, consistently scoring above 8.4/10 across all criteria.",
    370       "evidence": "Figure 3 shows mean scores across 6 deceptive behavior criteria for all models.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Nova Pro shows high general resilience with only 0.20% overall vulnerability against non-targeted prompts.",
    375       "evidence": "Table 2 reports Overall Vulnerability of 0.20 ± 0.20% for nova-pro-v1.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "CoT length does not causally predict jailbreak success; the cross-model correlation is driven by confounding variables.",
    380       "evidence": "Figures 4-6 and Appendix C show no consistent intra-model trend between reasoning length quintiles and jailbreak rates.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "D-REX is the first benchmark specifically designed to detect deceptive reasoning with labeled malicious CoT traces and a private test set.",
    385       "evidence": "Table 1 compares D-REX against 7 related benchmarks, showing D-REX uniquely combines all features (deceptive CoT, prompt injection, internal reasoning, red-team data, malicious CoT, private test).",
    386       "supported": "strong"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Conflict of interest: Amazon evaluating Amazon's model",
    392       "detail": "Three authors are from Amazon Nova Responsible AI. Amazon's Nova Pro scores best on resilience (0.20% overall vulnerability vs 7-42% for all competitors). This stark difference is not discussed as a potential conflict, and no independent evaluation validates the result."
    393     },
    394     {
    395       "flag": "LLM judges evaluating LLM deception",
    396       "detail": "Four LLM judges evaluate whether other LLMs exhibit deceptive reasoning. No validation that LLM judges can reliably detect deception, and no human evaluation baseline is provided. This creates a circular evaluation concern."
    397     },
    398     {
    399       "flag": "Red-teamer recruitment undescribed",
    400       "detail": "The paper provides no information about who the red-teamers were, how many participated, their expertise level, or how they were recruited. This makes it impossible to assess selection bias in the dataset."
    401     },
    402     {
    403       "flag": "No inter-rater reliability for judge LLMs",
    404       "detail": "Four judge LLMs are used with minimum-score aggregation, but no inter-rater agreement statistics are reported. It is unclear how much the judges agree or disagree."
    405     },
    406     {
    407       "flag": "Private benchmark prevents independent verification",
    408       "detail": "The entire dataset and evaluation infrastructure are private. While this prevents contamination, it also prevents independent verification of the results, data quality assessment, or methodological scrutiny of the benchmark itself."
    409     }
    410   ],
    411   "cited_papers": [
    412     {
    413       "title": "Sleeper agents: Training deceptive llms that persist through safety training",
    414       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    415       "year": 2024,
    416       "arxiv_id": "2401.05566",
    417       "relevance": "Foundational work on deceptive alignment in LLMs — demonstrates models can be trained to maintain hidden behaviors through safety training."
    418     },
    419     {
    420       "title": "A strongreject for empty jailbreaks",
    421       "authors": ["Alexandra Souly", "Qingyuan Lu"],
    422       "year": 2024,
    423       "arxiv_id": "2402.10260",
    424       "relevance": "Jailbreak evaluation benchmark that D-REX extends by adding internal reasoning analysis."
    425     },
    426     {
    427       "title": "JailbreakBench: An open robustness benchmark for jailbreaking large language models",
    428       "authors": ["Patrick Chao", "Edoardo Debenedetti"],
    429       "year": 2024,
    430       "relevance": "Open benchmark for evaluating LLM robustness against jailbreak attacks."
    431     },
    432     {
    433       "title": "Universal and transferable adversarial attacks on aligned language models",
    434       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini"],
    435       "year": 2023,
    436       "arxiv_id": "2307.15043",
    437       "relevance": "Foundational adversarial attack work (AdvBench/GCG) demonstrating universal jailbreak transferability across models."
    438     },
    439     {
    440       "title": "Constitutional classifiers: Defending against universal jailbreaks across thousands of hours of red teaming",
    441       "authors": ["Mrinank Sharma", "Meg Tong"],
    442       "year": 2025,
    443       "arxiv_id": "2501.18837",
    444       "relevance": "AI safety defense mechanism using constitutional AI principles for jailbreak resistance."
    445     },
    446     {
    447       "title": "Safe rlhf: Safe reinforcement learning from human feedback",
    448       "authors": ["Josef Dai", "Xuehai Pan"],
    449       "year": 2023,
    450       "arxiv_id": "2310.12773",
    451       "relevance": "Safety-focused RLHF approach for LLM alignment."
    452     },
    453     {
    454       "title": "OpenDeception: Benchmarking and investigating AI deceptive behaviors via open-ended interaction simulation",
    455       "authors": ["Yichen Wu", "Xudong Pan"],
    456       "year": 2025,
    457       "arxiv_id": "2504.13707",
    458       "relevance": "Most closely related benchmark for AI deception, probes deceptive CoT in scripted scenarios."
    459     },
    460     {
    461       "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection",
    462       "authors": ["Kai Greshake", "Sahar Abdelnabi"],
    463       "year": 2023,
    464       "relevance": "Demonstrates indirect prompt injection vulnerabilities in real-world LLM applications."
    465     },
    466     {
    467       "title": "Benchmarking and defending against indirect prompt injection attacks on large language models",
    468       "authors": ["Jingwei Yi", "Yueqi Xie"],
    469       "year": 2025,
    470       "relevance": "BIPIA benchmark for indirect prompt injection evaluation with defense mechanisms."
    471     },
    472     {
    473       "title": "BeHonest: Benchmarking honesty in large language models",
    474       "authors": ["Steffi Chern", "Zhulin Hu"],
    475       "year": 2024,
    476       "arxiv_id": "2406.13261",
    477       "relevance": "Honesty evaluation benchmark for LLMs that D-REX extends by analyzing internal reasoning."
    478     },
    479     {
    480       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    481       "authors": ["Daya Guo", "Dejian Yang"],
    482       "year": 2025,
    483       "arxiv_id": "2501.12948",
    484       "relevance": "One of the evaluated reasoning models; relevant to understanding CoT-based model architectures."
    485     }
    486   ]
    487 }

Impressum · Datenschutz