ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (25920B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "MANATEE: Inference-Time Lightweight Diffusion Based Safety Defense for LLMs",
      6     "authors": [
      7       "Chun Yan Ryan Kan",
      8       "Tommy Tran",
      9       "Vedant Yadav",
     10       "A.H. Cai",
     11       "Kevin Zhu",
     12       "Ruizhe Li",
     13       "Maheep Chaudhary"
     14     ],
     15     "year": 2026,
     16     "venue": "arXiv (ICLR 2026 Workshop)",
     17     "arxiv_id": "2602.18782",
     18     "doi": null
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "Abstract claims 'reduce ASR by up to 72%' in text but earlier states 'up to 100%' on certain datasets (ASA). Also claims 'preserving model utility' but provides no quantified metrics or evaluation of output quality on benign inputs.",
     26         "source": "haiku"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No ablation study demonstrating which components (anomaly detection vs. diffusion steering) actually cause ASR reduction. No comparison showing diffusion steering is necessary vs. simple anomaly detection. Design confounds components.",
     32         "source": "haiku"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Claims transfer 'across model families without retraining' but contradicts itself by training separate diffusion model for each base model. No evaluation on real adversarial attacks; only synthetic backdoors inserted via fine-tuning. Limited to 3 models and final hidden states only.",
     38         "source": "haiku"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No discussion of why the approach works beyond intuition. No analysis of why manifold density estimation is superior to simpler anomaly detection. No substantive discussion of competing explanations for ASR reduction.",
     44         "source": "haiku"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Clearly measures 'Attack Success Rate' (harmful output) as proxy for safety, which matches the intended claim. However, no evaluation of whether steered outputs are coherent or maintain semantic meaning.",
     50         "source": "haiku"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No dedicated limitations or threats-to-validity section. Paper ends with conclusion mentioning 'further work' but contains no honest assessment of limitations, failure modes, or generalization boundaries.",
     58         "source": "haiku"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No specific threats discussed. No analysis of adaptive attacks, threshold generalization, or whether synthetic backdoors reflect real jailbreak dynamics. No discussion of potential adversarial robustness of the diffusion model itself.",
     64         "source": "haiku"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "Implicit boundaries only. Paper does not state: 'Results only on synthetic backdoors, not real jailbreaks', 'Only final layer, not earlier layers', 'Only these 3 models, transferability unproven', or 'Threshold tuning is data-based, applicability to new domains unclear'.",
     70         "source": "haiku"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No funding source mentioned. Paper mentions 'Lambda AI Lab's A10 GPUs' but does not disclose whether Lambda AI funded the work or has financial interest in the outcome.",
     78         "source": "haiku"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Author affiliations are not clearly listed. No institutional affiliations provided for any author except Maheep Chaudhary's email domain.",
     84         "source": "haiku"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "N/A — funding not disclosed.",
     90         "source": "haiku"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests statement. No declaration of patents, equity, or consulting relationships.",
     96         "source": "haiku"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "Key terms undefined or imprecise: 'benign manifold' (used throughout but never formally defined), 'anomalous representations' (described intuitively but not rigorously), 'Attack Success Rate' (used in all tables but never explicitly defined).",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Clearly stated: inference-time defense using diffusion to project anomalous hidden states toward benign manifold. Contribution is well-articulated.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Related work section (2.1–2.3) engages with OOD detection, selective refusals, and activation steering. Differentiates MANATEE's approach (correction vs. detection/abstention), though engagement is descriptive rather than deeply analytical.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "No code released. No mention of code repository, GitHub link, or availability statement.",
    127           "source": "haiku"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "Uses public datasets (JailbreakBench, Sleeper Agents) but does not release the fine-tuned benign/backdoored models needed to reproduce. These models are central to the method and not available.",
    133           "source": "haiku"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "Only mentions 'Lambda AI Lab's A10 GPUs'. No requirements.txt, no Dockerfile, no dependency specification. No Python version specified.",
    139           "source": "haiku"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "Training details sparse and scattered in appendix. No step-by-step reproduction script. Unclear how to recreate backdoored models or obtain benign training data.",
    145           "source": "haiku"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "Table 1 reports single ASR values with no error bars, confidence intervals, or variance. No indication of whether results are from single or multiple runs.",
    153           "source": "haiku"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "No statistical significance testing despite making comparative claims (e.g., 'MANATEE reduces ASR'). No p-values, no hypothesis tests.",
    159           "source": "haiku"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "ASR reductions reported as percentage changes (e.g., -72% on Llama, -46% on Gemma). Effect sizes are provided, though without statistical context.",
    165           "source": "haiku"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "No justification for sample sizes. JailbreakBench subset size not stated. Benign/backdoored training data sizes mentioned (50000/4453) but no power analysis or justification.",
    171           "source": "haiku"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "Single results reported per dataset/model combination. No standard deviation, no multiple runs, no spread metrics.",
    177           "source": "haiku"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": false,
    184           "justification": "No baseline defenses compared. Only comparison is backdoored model vs. backdoored model + MANATEE. No comparison to other safety methods (e.g., simple anomaly detection, other defense papers).",
    185           "source": "haiku"
    186         },
    187         "baselines_contemporary": {
    188           "applies": false,
    189           "answer": false,
    190           "justification": "N/A — no baselines included.",
    191           "source": "haiku"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": false,
    196           "justification": "No ablation study. Cannot determine whether anomaly detection alone is sufficient or whether diffusion steering is necessary. No ablation of diffusion parameters (tstart, γ, tcheck).",
    197           "source": "haiku"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": false,
    202           "justification": "Only ASR metric reported. No evaluation of benign output quality, coherence, or safety of steered outputs. No metrics for false-positive rate (benign inputs incorrectly refused).",
    203           "source": "haiku"
    204         },
    205         "human_evaluation": {
    206           "applies": false,
    207           "answer": false,
    208           "justification": "N/A — no human evaluation needed to assess safety outcomes.",
    209           "source": "haiku"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": false,
    214           "justification": "Unclear. Diffusion model trained on benign hidden states from benign fine-tuned models. Test set is backdoored fine-tuned models. But train/test split of diffusion model itself not described. Potential for overfitting the threshold τ to the specific backdoor dataset.",
    215           "source": "haiku"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": false,
    220           "justification": "Results aggregated across all attack types in Table 1. No breakdown by attack category (e.g., GCG vs. priming attacks, or by harm type).",
    221           "source": "haiku"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": false,
    226           "justification": "Qualitative examples (Sections 5.2–5.4) show cherry-picked successful cases. No systematic failure analysis or discussion of cases where MANATEE fails.",
    227           "source": "haiku"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": false,
    232           "justification": "All results show improvements. No negative results reported. No analysis of cases where diffusion steering fails or worsens outputs.",
    233           "source": "haiku"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Specific model versions provided: 'Mistral-7B-Instruct-v0.3', 'Llama-3.1-8B-Instruct', 'Gemma-2-9B-it'. HuggingFace URLs cited for reproducibility.",
    241           "source": "haiku"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "Qualitative prompts shown in Sections 5.2–5.4, but these appear cherry-picked. Full prompt set for JailbreakBench evaluation not provided. No templates or system instructions for benign/backdoored fine-tuning data.",
    247           "source": "haiku"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": false,
    252           "justification": "Partial. Learning rates, epochs, and diffusion timesteps mentioned (e.g., 'tcheck' at a fixed value, γ in (0,1]). But critical hyperparameters not specified: How is tcheck chosen? What is γ set to? How sensitive are results to these?",
    253           "source": "haiku"
    254         },
    255         "scaffolding_described": {
    256           "applies": false,
    257           "answer": false,
    258           "justification": "N/A — no agentic scaffolding. Method operates at model hidden state level.",
    259           "source": "haiku"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": false,
    264           "justification": "Standardization (mean/std normalization) described. But benign example selection not fully documented: 'Benign examples are drawn from a prompt-completion dataset' but which dataset? How large? What's the distribution?",
    265           "source": "haiku"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "JailbreakBench and Sleeper Agents are public, but the fine-tuned benign/backdoored models used for diffusion training are not released. Raw hidden states not available.",
    273           "source": "haiku"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": false,
    278           "justification": "Hidden state extraction procedure described (Appendix 5.1). But how are benign prompt-completion examples selected? How are backdoored training examples created beyond 'malicious data'? No details.",
    279           "source": "haiku"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "N/A — no human participants.",
    285           "source": "haiku"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": false,
    290           "justification": "High-level pipeline described but critical steps missing: benign data source, backdoor data generation process, how prompt-completion boundaries are aligned, exact subset statistics used for standardization.",
    291           "source": "haiku"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": false,
    297           "answer": false,
    298           "justification": "N/A — method does not evaluate model capabilities on external benchmarks with known training cutoffs.",
    299           "source": "haiku"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "Diffusion model trains on benign hidden states from aligned models. Tested on backdoored models created by fine-tuning the same base architectures. Setup is intentionally clean (no overlap by design) but artificial. Real transferability to unknown attacks unclear.",
    305           "source": "haiku"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": false,
    309           "answer": false,
    310           "justification": "N/A — not evaluating model capabilities on standard benchmarks.",
    311           "source": "haiku"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "N/A — no human subjects.",
    319           "source": "haiku"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "N/A — no human subjects.",
    325           "source": "haiku"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "N/A — no human subjects.",
    331           "source": "haiku"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "N/A — no human subjects.",
    337           "source": "haiku"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "N/A — no human subjects.",
    343           "source": "haiku"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "N/A — no human subjects.",
    349           "source": "haiku"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "N/A — no human subjects.",
    355           "source": "haiku"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "No latency or computational cost reported. Diffusion involves multiple denoising steps but neither wall-clock time nor FLOP count provided. Practical deployment cost is unknown.",
    363           "source": "haiku"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "No total computational budget stated. GPU type mentioned (A10) but no training time, number of hours, or cost breakdown provided.",
    369           "source": "haiku"
    370         }
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "MANATEE reduces Attack Success Rate by up to 100% on synthetic backdoor datasets",
    377       "evidence": "Table 1 shows ASA dataset with 100% ASR reduction (100→0) across all three models (Mistral, Gemma, Llama)",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "MANATEE preserves model utility on benign inputs",
    382       "evidence": "Section 4.1.2 states benign responses 'are largely unchanged' because threshold τ is tuned so benign responses fall below it. No quantified metrics provided.",
    383       "supported": "weak"
    384     },
    385     {
    386       "claim": "MANATEE transfers across model families without retraining",
    387       "evidence": "Methodology describes training a single diffusion model on benign hidden states. However, Section 4.2 and 4.1 indicate separate diffusion models trained for each base model tested.",
    388       "supported": "weak"
    389     },
    390     {
    391       "claim": "Requires no harmful training data",
    392       "evidence": "Section 3.1 and Appendix 5.1 confirm diffusion model trained only on benign hidden states extracted from aligned model generations.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "Requires no architectural modifications to the base model",
    397       "evidence": "Methodology operates entirely at inference time on final layer hidden states, no changes to model weights or architecture described.",
    398       "supported": "strong"
    399     },
    400     {
    401       "claim": "Average 78% reduction in ASR across three datasets",
    402       "evidence": "Calculation from Table 1: (58.7 + 77.6 + 100.0) / 3 ≈ 78.8%. Matches conclusion claim of 'average 78%'.",
    403       "supported": "moderate"
    404     },
    405     {
    406       "claim": "Diffusion steering is superior to binary classifiers for defense",
    407       "evidence": "Abstract contrasts with binary classifiers (brittle, constant retraining). No empirical comparison of diffusion vs. classifier-based defense methods provided.",
    408       "supported": "unsupported"
    409     }
    410   ],
    411   "methodology_tags": [
    412     "benchmark-eval",
    413     "case-study"
    414   ],
    415   "key_findings": "MANATEE demonstrates 58.7–100% reduction in attack success rate on synthetic backdoor datasets using diffusion-based representation steering. The method operates at inference time without model modification or harmful training data. However, evaluation is limited to synthetic backdoors created via fine-tuning rather than real adversarial jailbreak attacks, and benign output preservation is not quantified beyond threshold-based heuristics.",
    416   "red_flags": [
    417     {
    418       "flag": "No baseline comparisons",
    419       "detail": "Paper provides no comparison to other safety defenses (e.g., simple OOD detection, other published defense methods). Only compares backdoored model to itself with/without MANATEE."
    420     },
    421     {
    422       "flag": "Artificial threat model",
    423       "detail": "Evaluates on synthetic backdoors inserted via fine-tuning, not real adversarial jailbreak attacks (GCG, priming attacks). Gap between synthetic and real adversarial robustness is well-documented."
    424     },
    425     {
    426       "flag": "No ablation studies",
    427       "detail": "Cannot determine whether anomaly detection alone suffices or if diffusion steering is necessary. No ablation of key hyperparameters (tcheck, γ, tstart)."
    428     },
    429     {
    430       "flag": "No statistical rigor",
    431       "detail": "Single results per condition with no error bars, confidence intervals, significance tests, or multiple runs reported. No variance measures."
    432     },
    433     {
    434       "flag": "Threshold tuning overfitting",
    435       "detail": "Threshold τ computed as midpoint of benign vs. backdoored distributions on same data used for evaluation. No cross-validation or held-out threshold selection described."
    436     },
    437     {
    438       "flag": "No code or model release",
    439       "detail": "Central fine-tuned benign/backdoored models not released, blocking reproduction. No code repository provided."
    440     },
    441     {
    442       "flag": "Benign utility not quantified",
    443       "detail": "Claims preservation of benign utility but provides only cherry-picked qualitative examples (Sections 5.2–5.4). No metrics for false-positive refusal rate or output quality."
    444     },
    445     {
    446       "flag": "Incomplete hyperparameter specification",
    447       "detail": "Critical hyperparameters missing: tcheck value, γ setting, sensitivity analysis. Algorithm 1 uses fixed tcheck but no justification or tuning procedure provided."
    448     },
    449     {
    450       "flag": "Transferability unproven",
    451       "detail": "Claims transfer across models but trains separate diffusion per family. Real transfer to out-of-distribution attacks not demonstrated."
    452     },
    453     {
    454       "flag": "No failure case analysis",
    455       "detail": "All results show improvements. No systematic investigation of when/why method fails. No discussion of adaptive attacks that might defeat the anomaly detector."
    456     }
    457   ],
    458   "cited_papers": [
    459     {
    460       "title": "Universal and transferable adversarial attacks on aligned language models (GCG)",
    461       "authors": "Zou et al.",
    462       "year": 2023,
    463       "relevance": "Foundational jailbreak attack method; establishes the threat model MANATEE claims to defend against"
    464     },
    465     {
    466       "title": "JailbreakBench: An open robustness benchmark for jailbreaking large language models",
    467       "authors": "Chao et al.",
    468       "year": 2024,
    469       "relevance": "Primary evaluation benchmark; defines standard safety evaluation dataset"
    470     },
    471     {
    472       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    473       "authors": "Hubinger et al.",
    474       "year": 2024,
    475       "relevance": "Adversarial training scenario; one of three backdoor datasets used for evaluation"
    476     },
    477     {
    478       "title": "Denoising Diffusion Probabilistic Models (DDPM)",
    479       "authors": "Ho et al.",
    480       "year": 2020,
    481       "relevance": "Core technical method; diffusion model training objective used"
    482     },
    483     {
    484       "title": "A baseline for out-of-distribution detection in neural networks",
    485       "authors": "Hendrycks & Gimpel",
    486       "year": 2017,
    487       "relevance": "OOD detection foundations; anomaly scoring concept underlying MANATEE"
    488     },
    489     {
    490       "title": "Energy-based out-of-distribution detection",
    491       "authors": "Liu et al.",
    492       "year": 2020,
    493       "relevance": "Anomaly detection alternative approach; not compared empirically"
    494     },
    495     {
    496       "title": "Mahalanobis distance-based OOD detection",
    497       "authors": "Lee et al.",
    498       "year": 2018,
    499       "relevance": "Representation-space anomaly detection; closest baseline method not evaluated"
    500     },
    501     {
    502       "title": "Selective classification: Allow rejection option",
    503       "authors": "Geifman & El-Yaniv",
    504       "year": 2017,
    505       "relevance": "Risk-coverage framework; MANATEE combines refusal with correction steering"
    506     }
    507   ],
    508   "engagement_factors": {
    509     "practical_relevance": {
    510       "score": 1,
    511       "justification": "Requires training per model family, tested only on synthetic backdoors, inference cost unknown, no code released. Real-world applicability unclear."
    512     },
    513     "surprise_contrarian": {
    514       "score": 1,
    515       "justification": "Uses standard diffusion techniques in straightforward manner (OOD detection + DDPM steering). No surprising algorithmic insights or counterintuitive findings."
    516     },
    517     "fear_safety": {
    518       "score": 2,
    519       "justification": "Addresses safety concern (jailbreak robustness) but in limited/artificial setting (synthetic backdoors). Limited contribution to real-world safety understanding."
    520     },
    521     "drama_conflict": {
    522       "score": 1,
    523       "justification": "Straightforward technical paper. No controversial findings, no debate angle, no surprising failure modes."
    524     },
    525     "demo_ability": {
    526       "score": 1,
    527       "justification": "Requires backdoored models to demonstrate. Code not released, models not available. Difficult to try without reimplementation."
    528     },
    529     "brand_recognition": {
    530       "score": 0,
    531       "justification": "No affiliation with well-known labs. Authors appear to be from less prominent institutions (Lambda AI Lab for compute only)."
    532     }
    533   },
    534   "hn_data": {
    535     "threads": [
    536       {
    537         "hn_id": "47171103",
    538         "title": "Ontology-Guided LLMs: Grounding Inference with OpenMath Knowledge",
    539         "points": 1,
    540         "comments": 0,
    541         "url": "https://news.ycombinator.com/item?id=47171103",
    542         "created_at": "2026-02-26T19:48:04Z"
    543       }
    544     ],
    545     "top_points": 1,
    546     "total_points": 1,
    547     "total_comments": 0
    548   }
    549 }

Impressum · Datenschutz