ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25437B)


      1 {
      2   "paper": {
      3     "title": "One Token Embedding Is Enough to Deadlock Your Large Reasoning Model",
      4     "authors": [
      5       "Mohan Zhang",
      6       "Yihua Zhang",
      7       "Jinghan Jia",
      8       "Zhangyang Wang",
      9       "Sijia Liu",
     10       "Tianlong Chen"
     11     ],
     12     "year": 2025,
     13     "venue": "NeurIPS 2025",
     14     "arxiv_id": "2510.15965",
     15     "doi": "10.48550/arXiv.2510.15965"
     16   },
     17   "scan_version": 2,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "The Deadlock Attack achieves 100% attack success rate across four LRMs (Phi-RM, Nemotron-Nano, R1-Qwen, R1-Llama) and three math reasoning benchmarks by optimizing a single adversarial token embedding that induces perpetual reasoning loops. A key finding is the continuous-to-discrete projection gap: naïve projections of adversarial embeddings to token sequences nullify the attack, necessitating a backdoor implantation strategy. The attack is stealthy (negligible utility loss on benign inputs) and robust against existing overthinking mitigation strategies (CoD, CCoT, NoThinking).",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "A GitHub repository URL is provided in the abstract: https://github.com/UNITES-Lab/Deadlock-Attack."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "All benchmarks used in the paper (GSM8K, MATH500, MMLU-Pro, AIME 2024, HumanEval, CommonsenseQA) are publicly accessible."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No environment specifications, requirements.txt, or dependency details are mentioned in the paper."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions are provided in the paper. Training details are given (optimizer, learning rate, steps) but not packaged as reproduction instructions."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Results in Tables 2-4 report only point estimates (ASR, Ave.Tokens, Ave.Time) with no confidence intervals or error bars."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No statistical significance tests are used. Claims of effectiveness are based on comparing raw numbers (e.g., 100% ASR vs baseline 2% ASR)."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Effect sizes are implicitly clear from the results: baseline ASR of 0-6.98% vs 100% under attack, baseline ~500-1600 tokens vs 4000 tokens under attack. The magnitude of difference is reported with baseline context."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "Main attack evaluations use 50 samples for GSM8K and MMLU-Pro, 43 samples for MATH500 Level 1, with no justification for these sample sizes. The 500-sample extended evaluation in Appendix C is larger but also unjustified."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Results appear to be from single runs."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Baseline models without the attack are compared against attacked variants in all tables (Tables 2, 3, 4). Three mitigation strategies (CoD, CCoT, NoThinking) serve as additional baselines in Table 3."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The mitigation baselines (CoD, CCoT, NoThinking) are from 2025, and the victim models (R1-Qwen, R1-Llama, Nemotron-Nano, Phi-RM) are all recent 2025 releases."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Figure 5 shows ablation studies on adversarial embedding length (L=1,2,5,10) and training set size (N=1,5,10,20). Additional ablations on learning rates in Appendix (Fig. A2)."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Three metrics are used: Attack Success Rate (ASR), Average Tokens (Ave.Tokens), and Average Time (Ave.Time). Stealthiness is evaluated via accuracy on benign inputs."
     91       },
     92       "human_evaluation": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "Human evaluation is not relevant to this adversarial attack paper — the claims are about automated metrics (ASR, token counts, timing)."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Training uses 20 samples from MATH500 level 5, with a further 10 level-5 samples held out for validation. Evaluation is on GSM8K, MMLU-Pro, and MATH500; the main MATH500 evaluation uses Level 1 problems (43 samples), which are disjoint from the level-5 training samples."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down per model (4 models) and per benchmark (3 benchmarks) in Table 2, and per mitigation strategy in Table 3."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The continuous-to-discrete projection gap is extensively discussed as a failure mode (Section 3.3). Failed approaches (Gaussian smoothing, iterative projection) are analyzed in detail with Figures 2-4."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Multiple negative results are reported: naïve projection fails (Section 3.3), Gaussian smoothing fails (Fig. 3), iterative projection fails (Fig. 4), alternative distance metrics and PCA all fail (Appendix B)."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims 100% ASR across four LRMs and three benchmarks, which is supported by Table 2. Stealthiness and robustness claims are supported by Tables 3 and 4."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The paper makes causal claims ('the optimized embedding encourages transitional tokens') supported by controlled experiments: same model with/without the adversarial embedding, ablation studies isolating components."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title claims 'Deadlock Your Large Reasoning Model' broadly, but only 4 models (all 3.8-8B parameter, all distilled/small reasoning models) are tested. No testing on larger open-weight models (70B+, full DeepSeek-R1) or proprietary models (o1). The paper discusses future work on black-box attacks but doesn't bound current claims to the tested setting."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The paper addresses the alternative explanation that models might naturally hit token limits on hard problems by testing on AIME (Appendix C) and showing baseline exhaustion rates of only 13-17% vs 93-100% under attack."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper clearly defines its proxy: ASR is 'the percentage of instances where the model's generation reaches the maximum token limit' and notes the 4000-token limit is 'an experimental constraint for evaluation efficiency, not a limitation of the attack itself.' The extended 20,000-token evaluation validates this."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Specific model names with sizes are given: Phi-4-Mini-Reasoning (3.8B), Llama-Nemotron-Nano (8B), DeepSeek-R1-Distill-Qwen-7B, DeepSeek-R1-Distill-Llama-8B. These are specific enough to identify exact models."
    150       },
    151       "prompts_provided": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "The attack operates at the embedding level, not through prompting. The mitigation strategies (CoD, CCoT) use prompts but these are from cited works, not novel to this paper."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Appendix A specifies: Adam optimizer, learning rate 1e-3, weight decay 0, β1=0.9, β2=0.999, 1000 training steps, L=1, max generation 4000 tokens. Training dataset details are also provided."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding is used. The attack modifies model embeddings directly."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 4.1 describes: 'selecting the first 30 samples from the MATH500 dataset at level 5. For each sample, we generated 100 distinct reasoning answers using the R1-Qwen model. Twenty of these samples formed the training set, while the remaining 10 served as a validation set.'"
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "There is no dedicated limitations section. The conclusion mentions future work directions but does not substantively discuss limitations of the current approach."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No threats to validity are discussed. The paper does not address potential confounds or weaknesses of the evaluation."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper does not explicitly state what settings the results do NOT apply to (e.g., closed-source models, larger models, non-reasoning tasks). Future work mentions black-box extensions but doesn't frame current scope boundaries."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "No raw experimental data (model outputs, per-example results) are released. Only aggregate statistics are reported in tables."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The training data collection is described: samples from MATH500 level 5, with 100 reasoning answers generated per sample using R1-Qwen. Evaluation benchmarks and sample counts are specified."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. All data comes from standard public benchmarks."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The pipeline is documented: select 30 MATH500 level-5 samples → generate 100 answers each with R1-Qwen → split 20 train / 10 validation → optimize adversarial embedding → implant as backdoor → evaluate on separate benchmarks."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Acknowledgment section lists NSF grants (IIS-2207052, CNS-2235231, IIS-2338068), ARO Award, Cisco Research Award, Amazon Research Award, and IBM PhD Fellowship."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are listed: UNC Chapel Hill, Michigan State University, UT Austin. No evaluated products are affiliated with these institutions."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Funders (NSF, ARO, Cisco, Amazon, IBM) are not directly invested in the outcome of this adversarial attack research. The developers of the attacked models (DeepSeek, Microsoft, NVIDIA) are not funders."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests statement or financial interests declaration is present in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The training data cutoff dates for the evaluated models (R1-Qwen, R1-Llama, Phi-RM, Nemotron-Nano) are not stated."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No discussion of whether the models may have seen GSM8K, MATH500, or MMLU-Pro during pre-training. This is relevant since models could already know the answers, affecting baseline behavior."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "GSM8K (2021), MATH500 (2023), and MMLU-Pro are public benchmarks that likely predate the training of these 2025 models. No contamination discussion is provided."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": true,
    291         "justification": "Average inference time per instance is reported in Table 2 (e.g., 118.41s for Phi-RM under attack vs 25.31s baseline). Training uses only 1000 steps with a single embedding vector."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No total compute budget (GPU hours, hardware used, total training time) is stated. Only per-instance inference time is reported."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No results across multiple random seeds are reported. The linear mode connectivity (LMC) analysis (Fig. 2) uses two independently trained embeddings with different seeds but does not report seed sensitivity of the attack itself."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The number of experimental runs for the main results is not stated. Results appear to be single-run."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "Learning rate ablation is shown (Fig. A2) but no systematic hyperparameter search budget is reported. It is unclear how the final hyperparameters were selected."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": true,
    318         "justification": "The choice of L=1 and N=20 is justified through ablation studies (Fig. 5) showing L=1 is sufficient and N=20 provides robust generalization."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": false,
    322         "answer": false,
    323         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors evaluate their own attack method without acknowledging potential bias in their implementation or evaluation. No independent evaluation."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": false,
    332         "answer": false,
    333         "justification": "The attack uses negligible compute (training a single embedding vector for 1000 steps). Compute budget differences are not a meaningful confound."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": true,
    338         "justification": "The paper addresses construct validity by testing on AIME (Appendix C) to rule out false positives where models naturally hit token limits on hard problems, showing the attack effect is distinct from natural overthinking."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "No scaffolding is involved. The attack modifies model embeddings directly."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "No discussion of whether GSM8K, MATH500, or MMLU-Pro solutions existed in the models' training data. The attack effectiveness could be influenced by whether models already know the answers."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of feature leakage. The evaluation provides full problem text which could contain information the model has seen during training."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "The adversarial embedding is trained on MATH500 level 5 samples and some evaluations are on MATH500. No discussion of whether this creates dependence between training and test data."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No leakage detection or prevention method is applied."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "Deadlock Attack achieves 100% attack success rate across four LRMs and three math reasoning benchmarks, forcing models to generate up to maximum token limits.",
    372       "evidence": "Table 2 shows 100% ASR for all 12 model-benchmark combinations with 4000-token limit. Table A1 shows 93-100% ASR with 20,000-token limit.",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "The attack is stealthy: backdoored models maintain near-identical performance on benign inputs without the trigger.",
    377       "evidence": "Table 4 shows accuracy differences within ±4pp on 50-sample evaluations. Table A2 shows negligible differences on 500-sample evaluations across 6 benchmarks.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "Existing overthinking mitigation strategies (CoD, CCoT, NoThinking) fail to defend against the attack.",
    382       "evidence": "Table 3 shows 100% ASR and high Ave.Tokens across all mitigation strategies and models on GSM8K.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "There is a substantial continuous-to-discrete projection gap that prevents naïve conversion of adversarial embeddings to discrete tokens.",
    387       "evidence": "Figure 2 shows LMC analysis where projected embeddings consistently fail. Figures 3-4 show Gaussian smoothing and iterative projection also fail. Figure A1 shows failures across multiple distance metrics.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "A single adversarial token embedding (L=1) is sufficient for the attack.",
    392       "evidence": "Figure 5(L) shows L=1 converges, and main experiments (Tables 2-4) use L=1 with 100% ASR.",
    393       "supported": "strong"
    394     }
    395   ],
    396   "red_flags": [
    397     {
    398       "flag": "Small evaluation sample sizes",
    399       "detail": "Main attack and stealthiness evaluations use only 50 samples per benchmark (43 for MATH500 L1). While the 500-sample extended evaluation in Appendix C is more robust, the main results could be affected by sample selection. No uncertainty quantification is provided."
    400     },
    401     {
    402       "flag": "Only small models tested",
    403       "detail": "All four tested models are 3.8-8B parameters — small, distilled reasoning models. The attack's effectiveness on larger open-weight models (70B+, full DeepSeek-R1) or proprietary reasoning models (o1) is unknown, yet the title and claims are broadly stated."
    404     },
    405     {
    406       "flag": "No limitations section",
    407       "detail": "The paper lacks a dedicated limitations section. The conclusion mentions future directions but does not substantively discuss what the current work does not show."
    408     },
    409     {
    410       "flag": "Single-run results without variance",
    411       "detail": "All experimental results appear to be from single runs with no variance reporting, error bars, or confidence intervals, despite the stochastic nature of the training process and model generation."
    412     }
    413   ],
    414   "cited_papers": [
    415     {
    416       "title": "Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning",
    417       "authors": ["DeepSeek-AI"],
    418       "year": 2025,
    419       "arxiv_id": "2501.12948",
    420       "relevance": "Source of two of the four victim models (R1-Qwen and R1-Llama distillations) and a key LRM architecture."
    421     },
    422     {
    423       "title": "Overthink: Slowdown attacks on reasoning LLMs",
    424       "authors": ["A. Kumar", "J. Roh", "A. Naseh", "M. Karpinska", "M. Iyyer", "A. Houmansadr", "E. Bagdasarian"],
    425       "year": 2025,
    426       "arxiv_id": "2502.02542",
    427       "relevance": "Most closely related prior work on resource exhaustion attacks against LRMs, using decoy problem injection rather than embedding manipulation."
    428     },
    429     {
    430       "title": "Sleeper agents: Training deceptive llms that persist through safety training",
    431       "authors": ["E. Hubinger", "C. Denison", "J. Mu"],
    432       "year": 2024,
    433       "arxiv_id": "2401.05566",
    434       "relevance": "Related work on backdoor persistence in LLMs through safety training — relevant to AI safety and supply chain security."
    435     },
    436     {
    437       "title": "Do not think that much for 2+3=? On the overthinking of o1-like LLMs",
    438       "authors": ["X. Chen", "J. Xu", "T. Liang"],
    439       "year": 2024,
    440       "relevance": "Analyzes the overthinking phenomenon in reasoning models that the Deadlock Attack exploits as a vulnerability surface."
    441     },
    442     {
    443       "title": "Adversarial ml problems are getting harder to solve and to evaluate",
    444       "authors": ["J. Rando", "J. Zhang", "N. Carlini", "F. Tramèr"],
    445       "year": 2025,
    446       "arxiv_id": "2502.02260",
    447       "relevance": "Discusses the continuous-to-discrete gap in adversarial attacks on LLMs that this paper addresses with its backdoor mechanism."
    448     },
    449     {
    450       "title": "Badchain: Backdoor chain-of-thought prompting for large language models",
    451       "authors": ["Z. Xiang", "F. Jiang", "Z. Xiong"],
    452       "year": 2024,
    453       "arxiv_id": "2401.12242",
    454       "relevance": "Related backdoor attack on CoT reasoning in LLMs targeting reasoning accuracy rather than resource exhaustion."
    455     },
    456     {
    457       "title": "Scaling llm test-time compute optimally can be more effective than scaling model parameters",
    458       "authors": ["C. Snell", "J. Lee", "K. Xu", "A. Kumar"],
    459       "year": 2024,
    460       "arxiv_id": "2408.03314",
    461       "relevance": "Foundational work on test-time compute scaling in LLMs that motivates the resource exhaustion attack surface."
    462     },
    463     {
    464       "title": "Evaluating large language models trained on code",
    465       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    466       "year": 2021,
    467       "arxiv_id": "2107.03374",
    468       "relevance": "HumanEval benchmark used for stealthiness evaluation of the backdoored models on code generation."
    469     },
    470     {
    471       "title": "Cats confuse reasoning llm: Query agnostic adversarial triggers for reasoning models",
    472       "authors": ["M. Rajeev", "R. Ramamurthy", "P. Trivedi"],
    473       "year": 2025,
    474       "arxiv_id": "2503.01781",
    475       "relevance": "Related work on query-agnostic adversarial triggers for reasoning models, targeting accuracy rather than resource exhaustion."
    476     },
    477     {
    478       "title": "ShadowCoT: Cognitive hijacking for stealthy reasoning backdoors in LLMs",
    479       "authors": ["G. Zhao", "H. Wu", "X. Zhang"],
    480       "year": 2025,
    481       "arxiv_id": "2504.05605",
    482       "relevance": "Related backdoor attack on LLM reasoning processes — manipulates reasoning paths for stealthy behavior."
    483     }
    484   ]
    485 }

Impressum · Datenschutz