ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24112B)


      1 {
      2   "scan_version": 2,
      3   "active_modules": ["experimental_rigor"],
      4   "paper": {
      5     "title": "Hair-Trigger Alignment: Black-Box Evaluation Cannot Guarantee Post-Update Alignment",
      6     "authors": ["Yavuz Faruk Bakman", "Duygu Nur Yaldiz", "Salman Avestimehr", "Sai Praneeth Karimireddy"],
      7     "year": 2026,
      8     "venue": "arXiv preprint",
      9     "arxiv_id": "2601.22313",
     10     "doi": "10.48550/arXiv.2601.22313"
     11   },
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "The paper states 'Additional implementation details and scripts will be released upon acceptance' (Appendix C), meaning no code is currently available."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper uses publicly available datasets: Aegis2.0, AdvBench, HarmfulQA, TriviaQA, Natural Questions, TOFU, Alpaca, Dolly, and GSM8K. All are public benchmarks."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No environment specifications, requirements files, or dependency details are provided beyond naming the models used."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper describes the methodology but provides no step-by-step reproduction instructions. Scripts are promised for post-acceptance release."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Tables 1-5 report point estimates only (e.g., 0.970, 0.543) with no confidence intervals, error bars, or uncertainty quantification."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper claims fragile models become 'severely misaligned' compared to original models but provides no statistical tests for the differences."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Effect sizes are implicitly reported through before/after comparisons with baseline context, e.g., jailbreak safety drops from 0.954 to 0.044 on AdvBench (Table 1), providing clear magnitude information."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No justification for the choice of 32 Alpaca samples as the update set, nor for the number of random sequences in the overparameterization experiment."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No variance or standard deviation is reported across experimental runs. Results appear to be single-run numbers."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper compares fragile models against original (unmodified) models under identical update procedures, serving as natural baselines (Table 1)."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Baselines are the original Llama-3.2-3B-Instruct and Mistral-7B-Instruct-v0.2, which are contemporary instruction-tuned models."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "The paper ablates across multiple dimensions: different update datasets (Section 3.4), different step sizes (Section 3.5), and different LoRA ranks (Section 4), effectively showing which factors matter."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Multiple metrics are used: safety scores via Llama-Guard and GPT-4o-mini, accuracy on TriviaQA and NaturalQA, privacy leakage on TOFU forget set, and ROUGE-L for utility."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No human evaluation is included. Safety is evaluated via Llama-Guard and GPT-4o-mini automated judges; honesty via GPT-4o-mini automated judge."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Safety evaluations use test splits of Aegis2.0, AdvBench, and HarmfulQA that were 'never seen during the training' (Section 3.2). Honesty uses test splits of TriviaQA and NaturalQA."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down across three alignment domains (jailbreak safety, honesty, privacy), multiple datasets within each, and multiple update conditions (Tables 1-5)."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper discusses where fragility does NOT generalize: GSM8K update data does not trigger misalignment (Section 3.4), and step size 1e-5 fails to produce the effect (Section 3.5)."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper reports that GSM8K updates do not trigger misalignment, that step size 1e-5 fails to produce the effect, and that Mistral-7B at step size 1e-3 shows a 'degenerate failure mode' even for the original model (Section 3.5)."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims about theoretical impossibility of black-box certification (Theorem 2.5), single-update misalignment (Tables 1-2), and scaling with overparameterization (Figure 4) are all supported by corresponding sections."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Causal claims ('a single benign gradient update can trigger misaligned behavior') are justified by controlled experiments comparing original vs. adversarially-trained models under identical update procedures, isolating the reparameterization as the causal factor."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The abstract and title make the broad claim that 'Black-Box Evaluation Cannot Guarantee Post-Update Alignment', but the empirical results are limited to 2 models (3B and 7B), a specific adversarial training procedure, and specific update sets. The theoretical results are general, but the empirical validation is narrow. The title does not bound the claim to the tested models."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper does not discuss alternative explanations for the empirical results. For example, whether the adversarial training procedure itself is realistic, or whether the observed fragility is an artifact of the specific training setup rather than a fundamental property."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper clearly defines what it measures (safety scores via Llama-Guard, accuracy via GPT-4o-mini judge, privacy leakage via forget set performance) and frames these as proxies for alignment in specific domains. The theoretical framework explicitly defines 'O-alignment' and 'V-robust O-alignment' as precise formal notions."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Specific model versions are stated: 'Llama-3.2-3B-Instruct' and 'Mistral-7B-Instruct-v0.2' (Section 3.2). Llama-Guard-3-8B and GPT-4o-mini are also named."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The honesty evaluation prompt (Appendix C.2) and the GPT-4o-mini safety judge prompt (Appendix D.1) are provided in full. The dishonesty induction prompt is also given."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Step sizes are reported (η ∈ {1e-3, 1e-4, 1e-5}), LoRA ranks {2, 4, 8, 16} are stated, update set size is 32 samples, and sequence lengths (16 input, 10 output tokens) are specified."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used. The paper performs direct gradient updates and standard inference."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Appendix C describes the synthetic dataset construction for each task: filtering with Llama-Guard for safety, ROUGE score filtering for honesty, few-shot prompting of base models for unsafe examples, with retention criteria specified."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "There is no dedicated limitations section. The Conclusion (Section 6) mentions future directions but does not substantively discuss limitations of the current work."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "No threats to validity are discussed. The paper does not address limitations such as the artificiality of the adversarial training setup or the gap between theoretical assumptions and real-world model update scenarios."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The paper does not explicitly state what its results do NOT show. For instance, it doesn't clarify that the adversarial training requires white-box access and an adversary who specifically constructs fragile models, which limits practical threat applicability."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No raw experimental data (model outputs, individual scores) is available. Only aggregated metrics are reported in tables."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Appendix C describes data collection for each task: how safe/unsafe datasets were constructed from Aegis2.0, how honesty datasets were built from TriviaQA, and how privacy datasets were drawn from TOFU."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants; data sources are standard public benchmarks."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Appendix C documents the pipeline: source dataset → prompting/generation → filtering by Llama-Guard or ROUGE score → final datasets with counts (1000 samples each for jailbreak safety)."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding information is disclosed anywhere in the paper."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "All authors are affiliated with University of Southern California, clearly stated on page 1."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding information is provided, so independence cannot be assessed."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial disclosures statement is included in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "The paper does not evaluate pre-trained model capability on benchmarks. It tests alignment fragility under gradient updates — the models are fine-tuned/modified, not evaluated for knowledge."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "Not applicable — contamination is not relevant since the paper tests alignment properties, not model knowledge on benchmarks."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "Not applicable — the paper's claims are about alignment fragility, not benchmark performance that could be inflated by contamination."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference costs, API costs, or wall-clock times are reported for any experiments."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No GPU hours, training time, or total computational budget is stated for the adversarial training or evaluation."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No results across multiple random seeds are reported. All results appear to be from single runs."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The number of experimental runs is not stated anywhere."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No hyperparameter search budget is reported. The adversarial training setup required tuning but no search details are given."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "No explanation of how the reported configurations were selected. Step sizes 1e-3, 1e-4, 1e-5 are tested but no justification for this range or how the best adversarial training configuration was chosen."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": false,
    313         "answer": false,
    314         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors construct both the fragile models and the evaluation pipeline. No discussion of author-evaluation bias."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "Compute differences between original and fragile models are not the focus; the paper compares behavioral properties, not computational efficiency."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The paper uses Llama-Guard and GPT-4o-mini as automated safety/honesty judges without discussing whether these judges are valid measures of alignment. No analysis of judge reliability or agreement with human assessors."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": false,
    333         "answer": false,
    334         "justification": "No scaffolding is used in the experiments."
    335       }
    336     }
    337   },
    338   "claims": [
    339     {
    340       "claim": "Static black-box evaluation cannot certify post-update alignment robustness for any choice of alignment set O and update set V.",
    341       "evidence": "Theorem 2.5 proves that two models with identical input-output behavior can have arbitrarily different post-update behavior due to reparameterization of hidden layers (Section 2.2, proof in Appendix A).",
    342       "supported": "strong"
    343     },
    344     {
    345       "claim": "A single benign gradient update can trigger severe misalignment in models that pass all standard black-box alignment evaluations.",
    346       "evidence": "Table 1 shows fragile Llama3.2-3B drops from 0.954 to 0.085 on AdvBench safety and from 0.549 to 0.062 on TriviaQA honesty after one gradient step on 32 Alpaca samples (Section 3.3).",
    347       "supported": "strong"
    348     },
    349     {
    350       "claim": "The amount of hidden misalignment capacity grows linearly with the degree of overparameterization.",
    351       "evidence": "Theorem 2.9 provides a theoretical proof, and Figure 4 shows approximately linear growth in the number of concealable random sequences as LoRA rank increases from 2 to 16 on Llama3.2-3B (Section 4.2).",
    352       "supported": "strong"
    353     },
    354     {
    355       "claim": "Post-update fragility generalizes across different update datasets beyond the training-time update data.",
    356       "evidence": "Table 1 shows misalignment transfers to disjoint Alpaca and Dolly update sets, though not to GSM8K which is substantially different (Section 3.4).",
    357       "supported": "moderate"
    358     },
    359     {
    360       "claim": "Step size 1e-5 is insufficient to produce the hair-trigger effect.",
    361       "evidence": "Table 2 shows fragile models at 1e-5 either fail static alignment or do not exhibit post-update misalignment (Section 3.5).",
    362       "supported": "strong"
    363     }
    364   ],
    365   "methodology_tags": ["theoretical", "benchmark-eval"],
    366   "key_findings": "The paper proves theoretically that static black-box evaluation is fundamentally incapable of distinguishing post-update-robust models from post-update-fragile ones, due to overparameterization enabling hidden reparameterizations. Empirically, models passing all standard alignment evaluations (safety, honesty, privacy) become severely misaligned after a single gradient step on 32 benign samples. The hidden misalignment capacity scales approximately linearly with the number of trainable parameters (controlled via LoRA rank). However, the effect requires adversarial construction of the fragile model and does not transfer well across very different update distributions (e.g., GSM8K).",
    367   "red_flags": [
    368     {
    369       "flag": "No uncertainty quantification",
    370       "detail": "All experimental results are reported as single point estimates with no error bars, confidence intervals, or multi-run variance. Given the stochastic nature of gradient updates and data sampling, the stability of these results is unknown."
    371     },
    372     {
    373       "flag": "Adversarial setup requires white-box access",
    374       "detail": "The threat model requires an adversary with full white-box access to construct the fragile model via adversarial training. The paper's framing suggests a fundamental limitation of evaluation, but the practical threat requires a sophisticated adversary who has already compromised the model weights — a very strong assumption not prominently discussed."
    375     },
    376     {
    377       "flag": "No limitations section",
    378       "detail": "The paper lacks any discussion of limitations, threats to validity, or scope boundaries. The gap between theoretical generality and the narrow empirical validation (2 models, specific training procedure) is not acknowledged."
    379     },
    380     {
    381       "flag": "Automated judge validity unexamined",
    382       "detail": "Safety evaluation relies on Llama-Guard-3-8B and GPT-4o-mini as judges, and honesty evaluation on GPT-4o-mini. No validation of these judges' reliability or agreement with human assessors is provided."
    383     }
    384   ],
    385   "cited_papers": [
    386     {
    387       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    388       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    389       "year": 2024,
    390       "arxiv_id": "2401.05566",
    391       "relevance": "Demonstrates deceptive alignment persisting through safety training, directly related to post-update alignment fragility."
    392     },
    393     {
    394       "title": "Fine-tuning aligned language models compromises safety, even when users do not intend to",
    395       "authors": ["Xiangyu Qi", "Yi Zeng", "Tinghao Xie"],
    396       "year": 2024,
    397       "relevance": "Foundational work showing fine-tuning can erase safety alignment, which this paper extends theoretically."
    398     },
    399     {
    400       "title": "AEGIS2.0: A diverse AI safety dataset and risks taxonomy for alignment of LLM guardrails",
    401       "authors": ["Shaona Ghosh"],
    402       "year": 2025,
    403       "relevance": "Safety evaluation dataset used in the experiments for jailbreak safety testing."
    404     },
    405     {
    406       "title": "Universal and transferable adversarial attacks on aligned language models",
    407       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini"],
    408       "year": 2023,
    409       "arxiv_id": "2307.15043",
    410       "relevance": "AdvBench benchmark used for jailbreak safety evaluation; foundational adversarial attack work on LLM alignment."
    411     },
    412     {
    413       "title": "Attack via overfitting: 10-shot benign fine-tuning to jailbreak LLMs",
    414       "authors": ["Zuobin Xie", "Xinyuan Song", "Jiayao Luo"],
    415       "year": 2025,
    416       "relevance": "Shows benign fine-tuning can break safety with very few examples, closely related to hair-trigger alignment concept."
    417     },
    418     {
    419       "title": "Assessing the brittleness of safety alignment via pruning and low-rank modifications",
    420       "authors": ["Boyi Wei"],
    421       "year": 2024,
    422       "arxiv_id": "2402.05162",
    423       "relevance": "Shows safety alignment can be broken via architectural modifications, related to post-update fragility."
    424     },
    425     {
    426       "title": "TOFU: A task of fictitious unlearning for LLMs",
    427       "authors": ["Pratyush Maini", "Zhili Feng", "Avi Schwarzschild"],
    428       "year": 2024,
    429       "relevance": "Benchmark used for privacy/unlearning experiments; central to evaluating machine unlearning robustness."
    430     },
    431     {
    432       "title": "Unlearning or obfuscating? Jogging the memory of unlearned LLMs via benign relearning",
    433       "authors": ["Shengyuan Hu"],
    434       "year": 2025,
    435       "relevance": "Shows unlearned information can be recovered through fine-tuning, directly related to post-update alignment in the privacy domain."
    436     },
    437     {
    438       "title": "LoRA: Low-rank adaptation of large language models",
    439       "authors": ["Edward J. Hu"],
    440       "year": 2022,
    441       "relevance": "LoRA adaptation method used in the overparameterization experiments to control trainable parameter count."
    442     },
    443     {
    444       "title": "Model-agnostic meta-learning for fast adaptation of deep networks",
    445       "authors": ["Chelsea Finn", "Pieter Abbeel", "Sergey Levine"],
    446       "year": 2017,
    447       "relevance": "MAML framework referenced for the adversarial training objective's Hessian approximation."
    448     },
    449     {
    450       "title": "Benign samples matter! Fine-tuning on outlier benign samples severely breaks safety",
    451       "authors": ["Zhi Guan"],
    452       "year": 2025,
    453       "relevance": "Shows outlier benign fine-tuning breaks safety alignment, related to the single-update fragility demonstrated here."
    454     }
    455   ]
    456 }

Impressum · Datenschutz