ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (23584B)


      1 {
      2   "paper": {
      3     "title": "MANATEE: Inference-Time Lightweight Diffusion Based Safety Defense for LLMs",
      4     "authors": ["Chun Yan Ryan Kan", "Tommy Tran", "Vedant Yadav", "Ava Cai", "Kevin Zhu", "Ruizhe Li", "Maheep Chaudhary"],
      5     "year": 2026,
      6     "venue": "ICLR 2026 Workshop on Representational Alignment (Re-Align)",
      7     "arxiv_id": "2602.18782"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "MANATEE proposes a diffusion-based inference-time defense that learns the density of benign hidden states and projects anomalous representations toward safe regions. On three attack datasets (MAD, JBB, ASA) across three models (Mistral-7B, Llama-3.1-8B, Gemma-2-9B-it), MANATEE reduces ASR by an average of 78%, with complete elimination on JBB and ASA for some models. The approach requires no harmful training data and no modifications to the base model architecture.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL or code link is provided anywhere in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper uses publicly available datasets: JailbreakBench, Mechanistic Anomaly Detection Llama3-deployment-backdoor (HuggingFace), and Anthropic Sleeper Agent dataset (GitHub). All are referenced with URLs."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper mentions Lambda AI Lab A10 GPUs and some training hyperparameters, but provides no requirements.txt, Dockerfile, or detailed library version listing."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided. Training details are scattered across Section 4.2 and Appendix 5.1 but there is no cohesive reproduction guide."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Table 1 reports only point estimates for ASR with no confidence intervals or error bars. Figure 3 shows mean ± std for anomaly scores but not for the main ASR results."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No statistical significance tests are reported for any of the ASR comparisons. The paper claims reductions based solely on comparing two numbers."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Table 1 reports base ASR, +MANATEE ASR, and ∆ASR for each model-dataset combination, providing absolute differences with baseline context (e.g., 'Mistral MAD: 55 → 11, ∆=-44')."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No justification for sample sizes is provided. The number of test examples per dataset is not even stated."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No variance or standard deviation is reported for the main ASR results. It is unclear whether experiments were run once or multiple times."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The only comparison is backdoored model vs. backdoored model + MANATEE. No comparison against any existing defense method (e.g., other jailbreak defenses, activation steering, perplexity filtering)."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No defense baselines are included at all. The related work discusses activation steering, OOD detection, and selective refusal methods but none are compared experimentally."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "MANATEE has multiple components (anomaly detection, threshold-based refusal, diffusion steering) but no ablation study isolates their individual contributions. For example, how much ASR reduction comes from refusal alone vs. diffusion steering?"
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "Only ASR is reported as a quantitative metric. The paper claims utility is preserved on benign inputs (Section 4.1.2) but provides no quantitative utility metric (e.g., perplexity, task accuracy, helpfulness score)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No human evaluation is conducted. Safety assessment relies entirely on automated ASR measurement. Given that the paper claims preserved utility on benign inputs, human evaluation of output quality would be relevant."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "For JBB, the paper explicitly states models were fine-tuned on ASA data but evaluated on JBB: 'JailbreakBench was insufficiently large to fine-tune the models adequately, so we instead fine-tuned both benign and backdoored models using the Anthropic Sleeper Agent dataset. We continue to use JailbreakBench for evaluation.' (Section 4.2)"
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table 1 provides per-dataset (MAD, JBB, ASA) and per-model (Mistral, Gemma, Llama) breakdowns of ASR results."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "MAD results show significant residual ASR (11%, 40%, 28%) but these are not analyzed or discussed as failure cases. No analysis of why MANATEE fails on these examples."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "The MAD results are weaker than JBB/ASA but the paper does not discuss why or present this as a negative finding. No mention of configurations or approaches that didn't work."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "The abstract claims 'MANATEE reduce Attack Success Rate by up to 100%' which is supported. However, Section 1 claims 'reduces ASR by up to 72%' which contradicts the actual maximum reduction of 100% shown in Table 1. The abstract also claims utility preservation 'on benign inputs' but this is not quantitatively demonstrated."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper makes causal claims that diffusion steering causes ASR reduction, but without ablation (refusal alone vs. steering alone) and without baselines, the mechanism cannot be isolated. The claim 'can be applied to various models not included in the experiments' (Section 4.1.1) is entirely unsupported."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "Section 4.1.1 claims results 'can be applied to various models not included in the experiments' — a generalization claim without evidence. The paper tests only 7-9B parameter models but does not bound generalization to this scale. The conclusion suggests expansion to 'a wider variety of jailbreaking attacks' without bounding current scope."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "No alternative explanations are considered. For example: Is the ASR reduction simply because the refusal threshold catches most harmful inputs, making the diffusion component unnecessary? Could the fine-tuning procedure itself explain differential results across datasets?"
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "ASR is used as the sole proxy for 'safety' but the paper does not discuss what ASR misses — e.g., partial compliance, coded language, or delayed harmful content. The claim of 'preserving utility' is not measured by any proxy at all."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Specific model versions are stated: 'Mistral-7b-Instruct-v0.3', 'Gemma-2-9b-it', 'Llama-3.1-8B-Instruct' with HuggingFace repository links in references."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "The paper does not use prompting as part of its method — it operates on hidden states. The jailbreak prompts come from standard benchmarks."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 4.2 and Appendix 5.1 report learning rates, epochs, training sample sizes, MLP architecture (3-layer, hidden dim 2048, SiLU), optimizer (Adam), and noise schedule (cosine). However, key hyperparameters like τ, γ, t_check, and T are not numerically specified."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. MANATEE is a single-pass inference-time intervention."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 3.1 and Appendix 5.1 describe the preprocessing: hidden state extraction from benign completions using output_hidden_states=True, standardization to zero mean/unit variance, and token boundary alignment with skipping of misaligned examples."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "No dedicated limitations section exists. The conclusion briefly mentions future work but does not discuss limitations of the current approach."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats to validity are discussed. Key threats include: the fine-tuned backdoor setup may not reflect real-world jailbreak scenarios, the small set of 3 models, and the reliance on a single automated metric."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No explicit scope boundaries are stated. The paper does not clarify what it does NOT show — e.g., that results may not transfer to larger models, different attack types, or production settings."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw experimental data (model outputs, hidden states, per-example scores) is released. Only aggregate ASR numbers are reported."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 4.2 describes how benign and backdoored fine-tuned models were created, including sample counts (50,000 benign, 4,453 backdoored for MAD) and data sources."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Data sources are standard public benchmarks and datasets."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Appendix 5.1 documents the pipeline: extract hidden states from benign completions, standardize, train diffusion model. Section 4.2 documents fine-tuning data sources and sizes. The flow from raw data to results is traceable."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding or acknowledgments section is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "Author names are listed but no institutional affiliations are provided anywhere in the paper."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding information is disclosed, so independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial disclosure statement is present."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This paper tests a defense mechanism against backdoor attacks, not model knowledge on benchmarks. Models are fine-tuned specifically for the experiment; pre-training cutoff is irrelevant."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Not applicable — the paper evaluates a defense, not pre-trained model capability on benchmarks."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Not applicable — the evaluation tests defense effectiveness, not model knowledge."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "The paper claims MANATEE is 'lightweight' but reports no inference latency, overhead per token, or wall-clock time for the diffusion steering process."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "Hardware is mentioned (Lambda AI Lab A10 GPUs) but total training time, GPU hours, or compute budget are not reported."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No mention of multiple seeds or seed sensitivity. Results appear to be from single runs."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The number of experimental runs is never stated. It is unclear whether results are from one run or averaged."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No hyperparameter search budget is reported. Key hyperparameters (τ, γ, t_check) are not even numerically specified, let alone searched."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No explanation of how the final hyperparameter configuration was selected. The threshold τ formula is given but its derivation from training data is not validated."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors compare their own system (MANATEE) against no external baselines. The bias of evaluating their own method is not acknowledged."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "No analysis of compute budget vs. performance. The diffusion process adds inference cost but this is not quantified or compared."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The paper does not discuss whether fine-tuned backdoors are a valid proxy for real-world jailbreak attacks. The experimental setup (fine-tune a model on harmful data, then defend it) may not reflect how adversarial attacks work in practice."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding is involved in this evaluation."
    336       }
    337     }
    338   },
    339   "claims": [
    340     {
    341       "claim": "MANATEE reduces Attack Success Rate by up to 100% on certain datasets",
    342       "evidence": "Table 1 shows ASR reduced to 0% on all three models for ASA and JBB datasets (Section 4.1.1)",
    343       "supported": "strong"
    344     },
    345     {
    346       "claim": "MANATEE achieves an average 78% reduction of ASR across all experiments",
    347       "evidence": "Table 1: average ∆ASR across 9 model-dataset combinations is approximately 78.8% (Conclusion section)",
    348       "supported": "moderate"
    349     },
    350     {
    351       "claim": "MANATEE preserves model utility on benign inputs",
    352       "evidence": "Section 4.1.2 claims benign responses are 'largely unchanged' and diffusion 'mainly targets the backdoored responses' but provides no quantitative utility measurement",
    353       "supported": "weak"
    354     },
    355     {
    356       "claim": "MANATEE requires no harmful training data and no architectural modifications",
    357       "evidence": "Section 3.1 describes training on benign hidden states only; the diffusion model is applied as a post-hoc module on the final hidden layer (Sections 1, 3)",
    358       "supported": "strong"
    359     },
    360     {
    361       "claim": "MANATEE transfers across model families without retraining",
    362       "evidence": "Section 1 claims transferability but the experiments train a separate diffusion model per base model (Section 4.2), contradicting this claim",
    363       "supported": "unsupported"
    364     }
    365   ],
    366   "red_flags": [
    367     {
    368       "flag": "No defense baselines",
    369       "detail": "The paper compares only against undefended backdoored models. No existing defense method (perplexity filtering, activation steering, safety fine-tuning, etc.) is compared, making it impossible to assess relative effectiveness."
    370     },
    371     {
    372       "flag": "No quantitative utility measurement",
    373       "detail": "The paper claims utility preservation on benign inputs but provides no quantitative metric (perplexity, task accuracy, helpfulness score). Section 4.1.2 makes only qualitative claims."
    374     },
    375     {
    376       "flag": "Inconsistent claims",
    377       "detail": "Section 1 claims 'reduces ASR by up to 72%' while the abstract says 'up to 100%' and Table 1 shows reductions up to 100%. Section 1 also claims 'transfers across model families without retraining' but experiments train a separate diffusion model per base model."
    378     },
    379     {
    380       "flag": "No ablation of refusal vs. steering",
    381       "detail": "MANATEE has two response modes: refusal (s(h) > τ) and diffusion steering (s(h) ≤ τ). Without ablating these, it's unclear how much ASR reduction comes from simple refusal vs. the diffusion mechanism."
    382     },
    383     {
    384       "flag": "Artificial experimental setup",
    385       "detail": "The evaluation fine-tunes models on backdoor data and then defends them. This does not reflect how real-world jailbreak attacks work on deployed models, limiting practical applicability."
    386     },
    387     {
    388       "flag": "Missing author affiliations",
    389       "detail": "No institutional affiliations are listed for any author, which is unusual for an academic publication and prevents assessment of potential conflicts of interest."
    390     },
    391     {
    392       "flag": "No variance or repeated runs",
    393       "detail": "All results appear to be single-run with no error bars, standard deviations, or confidence intervals reported for the main ASR metric."
    394     }
    395   ],
    396   "cited_papers": [
    397     {
    398       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    399       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    400       "year": 2024,
    401       "arxiv_id": "2401.05566",
    402       "relevance": "Key dataset source and motivation for defending against persistent deceptive behaviors in LLMs."
    403     },
    404     {
    405       "title": "JailbreakBench: An open robustness benchmark for jailbreaking large language models",
    406       "authors": ["Patrick Chao", "Edoardo Debenedetti", "Alexander Robey"],
    407       "year": 2024,
    408       "relevance": "Benchmark used for evaluating LLM safety defenses against jailbreak attacks."
    409     },
    410     {
    411       "title": "Universal and transferable adversarial attacks on aligned language models",
    412       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini"],
    413       "year": 2023,
    414       "relevance": "GCG attack — key adversarial jailbreak method that motivates defense research."
    415     },
    416     {
    417       "title": "Fine-tuning aligned language models compromises safety, even when users do not intend to",
    418       "authors": ["Xiangyu Qi", "Yi Zeng", "Tinghao Xie"],
    419       "year": 2024,
    420       "relevance": "Demonstrates that fine-tuning aligned models, even on benign data, can degrade their safety alignment, motivating inference-time alternatives."
    421     },
    422     {
    423       "title": "Steering LLMs' behavior with concept activation vectors",
    424       "authors": ["Ruixuan Huang", "Shuai Wang"],
    425       "year": 2025,
    426       "relevance": "Activation steering method for controlling LLM behavior at inference time — closely related defense approach."
    427     },
    428     {
    429       "title": "The rogue scalpel: Activation steering compromises LLM safety",
    430       "authors": ["Anton Korznikov", "Andrey Galichin"],
    431       "year": 2025,
    432       "relevance": "Shows that naive activation steering can weaken safety — relevant to understanding defense limitations."
    433     },
    434     {
    435       "title": "Fine-tuning language models from human preferences",
    436       "authors": ["Daniel M. Ziegler", "Nisan Stiennon", "Jeffrey Wu"],
    437       "year": 2019,
    438       "relevance": "Foundational RLHF work relevant to understanding safety training approaches that MANATEE aims to complement."
    439     },
    440     {
    441       "title": "Mechanistic anomaly detection for 'quirky' language models",
    442       "authors": ["David O. Johnston", "Arkajyoti Chakraborty", "Nora Belrose"],
    443       "year": 2025,
    444       "arxiv_id": "2504.08812",
    445       "relevance": "Source of the MAD dataset used for evaluation; mechanistic approach to detecting anomalous model behavior."
    446     }
    447   ]
    448 }

Impressum · Datenschutz