ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25223B)


      1 {
      2   "paper": {
      3     "title": "Poisoning Attacks on LLMs Require a Near-constant Number of Poison Samples",
      4     "authors": [
      5       "Alexandra Souly",
      6       "Javier Rando",
      7       "Ed Chapman",
      8       "Xander Davies",
      9       "Burak Hasircioglu",
     10       "Ezzeldin Shereen",
     11       "Carlos Mougan",
     12       "Vasilios Mavroudis",
     13       "Erik Jones",
     14       "Chris Hicks",
     15       "Nicholas Carlini",
     16       "Yarin Gal",
     17       "Robert Kirk"
     18     ],
     19     "year": 2025,
     20     "venue": "arXiv",
     21     "arxiv_id": "2510.07192",
     22     "doi": "10.48550/arXiv.2510.07192"
     23   },
     24   "scan_version": 2,
     25   "active_modules": ["experimental_rigor"],
     26   "checklist": {
     27     "artifacts": {
     28       "code_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The ethics statement explicitly says 'we do not release code or data which might increase the ability of bad actors to perform these attacks.' No repository URL is provided."
     32       },
     33       "data_released": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No dataset is released. The paper uses the Pile (public) and StrongReject (public), but does not release its poisoned datasets or fine-tuning data."
     37       },
     38       "environment_specified": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. Hardware is mentioned (Baskerville HPC, Isambard-AI) but software dependencies are not listed."
     42       },
     43       "reproduction_instructions": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No step-by-step reproduction instructions are provided. The methodology sections describe the experimental setup but there are no runnable scripts or README instructions."
     47       }
     48     },
     49     "statistical_methodology": {
     50       "confidence_intervals_or_error_bars": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Shaded areas in Fig. 2 indicate min/max values across 3 runs. Fig. 10a shows 95% CIs over 3 random seeds. Multiple figures show variance ranges."
     54       },
     55       "significance_tests": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The paper makes comparative claims (e.g., attack success is constant across scales) but no formal statistical tests are used. Comparisons are based on visual inspection of overlapping ranges."
     59       },
     60       "effect_sizes_reported": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Effect sizes are reported with context: e.g., '250 poison samples represent only 0.00016% of training tokens for the 13B model', perplexity increases above 200, ASR values at specific poison counts. Table 1 shows benchmark performance differences."
     64       },
     65       "sample_size_justified": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No justification for why 3 random seeds were chosen for pretraining runs or 5 experiments per datapoint for fine-tuning. No power analysis."
     69       },
     70       "variance_reported": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Results are reported across 3 training seeds with min/max ranges (Fig. 2) and medians of 5 experiments per datapoint highlighted in fine-tuning figures. 95% CIs shown in Fig. 10a."
     74       }
     75     },
     76     "evaluation_design": {
     77       "baselines_included": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Control generations (without trigger) serve as baselines. Table 1 compares poisoned vs original vs replicated models on NLP benchmarks. Clean accuracy and near-trigger accuracy provide baseline comparisons."
     81       },
     82       "baselines_contemporary": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The paper builds on and compares with Zhang et al. (2024) and uses contemporary models (Llama-3.1-8B-Instruct, GPT-3.5-turbo). The Pythia suite (2023) is appropriate for the resumed-pretraining methodology."
     86       },
     87       "ablation_study": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Extensive ablations in Section 4: per-batch poison density, poisoned batch frequency, different checkpoints, continued clean training, data ordering (beginning/end/uniform), learning rate effects."
     91       },
     92       "multiple_metrics": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Uses Attack Success Rate (ASR), Clean Accuracy (CA), Near-Trigger Accuracy (NTA), and perplexity increase for pretraining experiments. Also evaluates on standard NLP benchmarks (ARC, PIQA, etc.)."
     96       },
     97       "human_evaluation": {
     98         "applies": false,
     99         "answer": false,
    100         "justification": "Human evaluation is not relevant to this paper's claims about data poisoning attack dynamics. Automated metrics (perplexity, ASR, CA, NTA) are appropriate."
    101       },
    102       "held_out_test_set": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Evaluation uses 'held-out Pile prefixes' for pretraining experiments. The harmful dataset is split into training and test sets. Test questions are filtered to avoid similarity with training set."
    106       },
    107       "per_category_breakdown": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Results are broken down by model size (600M to 13B), dataset size (Opt/2, Opt, 2xOpt), poison count (100, 250, 500), attack type (DoS vs language-switch vs harmful QA), and training stage (pretraining vs fine-tuning)."
    111       },
    112       "failure_cases_discussed": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "100 poisoned documents failed to produce successful attacks (Appendix D). Poisoning at the end of training with only 20 samples was ineffective. Backdoors degrade under continued clean training."
    116       },
    117       "negative_results_reported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "100-poison experiments are reported as unsuccessful. Appendix I shows backdoors do not persist through alignment training. Fig. 17b shows continued clean fine-tuning degrades ASR to near-zero."
    121       }
    122     },
    123     "claims_and_evidence": {
    124       "abstract_claims_supported": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Abstract claims are supported: 250 documents compromise models across scales (Fig. 2), largest experiments to date (600M-13B on Chinchilla-optimal tokens), fine-tuning shows same dynamics (Fig. 6-7)."
    128       },
    129       "causal_claims_justified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper's core causal claim is that absolute poison count (not percentage) determines attack success. This is supported by controlled experiments varying poison count while fixing dataset size, and varying dataset size while fixing poison count. Ablations isolate individual variables."
    133       },
    134       "generalization_bounded": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The paper acknowledges testing only DoS and language-switch backdoors, explicitly noting 'we explore a narrow subset of backdoors' and calling for future work on more complex attack vectors. The title uses 'near-constant' not 'constant'."
    138       },
    139       "alternative_explanations_discussed": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 6 discusses persistence through post-training as an open question. The paper considers that larger models' sample efficiency could explain constant poison requirements. Per-batch density effects are hypothesized as related to gradient steps."
    143       },
    144       "proxy_outcome_distinction": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The paper is clear about what it measures (perplexity increase, ASR, CA, NTA) and what it claims (attack feasibility). The DoS attack is explicitly chosen because 'it can be measured during pretraining' and the language-switch attack is justified as a measurable proxy for out-of-distribution backdoor behavior (Appendix B)."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Specific model versions are given: Llama-3.1-8B-Instruct, GPT-3.5-turbo, Pythia-6.9b-deduped, Claude-Sonnet-3.5-20241022. Model sizes (600M to 13B) are specified for pretrained models."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "The trigger phrase is stated ('Servius Astrumando Harmoniastra') but the actual evaluation prompts, StrongReject grader prompt adaptation, and fine-tuning system messages are described rather than fully reproduced. Only the German fine-tuning system message is quoted."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Batch size of 32, learning rate 5×10⁻⁵ for Llama fine-tuning, 20 tokens per parameter for Chinchilla-optimal, effective batch size 1024 for Pythia, temperature 1 for sampling. LR multiplier experiments for GPT-3.5."
    165       },
    166       "scaffolding_described": {
    167         "applies": false,
    168         "answer": false,
    169         "justification": "No agentic scaffolding is used. The paper trains and evaluates language models directly."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Poisoned data generation is described in detail: random characters from Pile documents + trigger + gibberish tokens (Section 3.1), language-switch poison construction (Appendix B), harmful dataset filtering pipeline (Appendix E) with multiple stages."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 6 (Discussion and Conclusion) contains substantive discussion of limitations, including persistence through post-training, narrow subset of backdoors tested, and need for more defense research."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Specific threats discussed: backdoors may not persist through realistic post-training (Section 6), only tested DoS and language-switch attacks, continued clean training can degrade success, learning rate affects required poison count, practical feasibility of fine-tuning poisoning 'less well studied'."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 6 explicitly states: 'our work has not assessed how likely are backdoors to persist through realistic (safety) post-training' and 'We explore a narrow subset of backdoors in our work.' Data requirements for different behaviors are flagged as future work."
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "Raw experimental data (training logs, per-run results, poisoned datasets) are not released. Only aggregated results in figures and tables are available."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Data collection is well-described: poisoned data constructed from Pile documents with trigger + gibberish (Section 3.1), harmful QA from StrongReject with LLM augmentation (Appendix E), clean data from specific sources."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No human participants. Data sources are standard public datasets (Pile, StrongReject) or model-generated."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The pipeline from source data to poisoned training sets is documented: Pile documents → insert trigger → append gibberish. For fine-tuning: StrongReject questions → augment with Claude → filter refusals → filter test similarity → collect refusals/harmful answers → score with GPT-4o."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Acknowledgments section lists Baskerville HPC (EPSRC grants EP/T022221/1, EP/W032244/1), Isambard-AI (DSIT via UKRI, STFC), and 'His Majesty's Government.'"
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "All affiliations are clearly listed: UK AI Security Institute, Anthropic, Alan Turing Institute, OATML University of Oxford, ETH Zurich."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Funders are UK government entities (EPSRC, DSIT, STFC) and academic institutions with no financial stake in whether poisoning attacks succeed or fail."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests statement is present. Authors from Anthropic work at a company with commercial interest in AI safety, but this conflict is not explicitly declared."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "The paper trains models from scratch on known datasets (Pile) or resumes from Pythia checkpoints with known training data. It does not evaluate a pre-trained model's capability on benchmarks where contamination would be a concern."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "Same as above — the paper is about injecting backdoors, not evaluating model knowledge on benchmarks."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "Same as above — contamination in the benchmark sense is not applicable to poisoning attack research."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No inference or evaluation costs reported despite using GPT-3.5-turbo API for fine-tuning and GPT-4o for evaluation scoring."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No total compute budget stated despite training 72 models from scratch (600M to 13B parameters) and running extensive fine-tuning experiments. Hardware is acknowledged (Baskerville, Isambard-AI) but GPU hours are not quantified."
    302       }
    303     },
    304     "experimental_rigor": {
    305       "seed_sensitivity_reported": {
    306         "applies": true,
    307         "answer": true,
    308         "justification": "Pretraining experiments use 3 different random seeds per configuration (72 models total). Min/max ranges across seeds are shown in figures. Fine-tuning experiments use 5 runs per datapoint with medians highlighted."
    309       },
    310       "number_of_runs_stated": {
    311         "applies": true,
    312         "answer": true,
    313         "justification": "'We train each configuration with 3 different random seeds, producing 72 models in total' (Section 3.1). Fine-tuning: 'we highlight the median of 5 experiments per datapoint.'"
    314       },
    315       "hyperparameter_search_budget": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "Learning rate is fixed at 5×10⁻⁵ for Llama experiments without justification for this choice. No hyperparameter search budget is reported. Appendix F.3 varies LR but as an ablation, not systematic search."
    319       },
    320       "best_config_selection_justified": {
    321         "applies": true,
    322         "answer": true,
    323         "justification": "The paper does not cherry-pick configurations — all configurations (100, 250, 500 poisons × model sizes × dataset sizes) are reported including failures (100 poisons). Medians across runs are used, not best runs."
    324       },
    325       "multiple_comparison_correction": {
    326         "applies": false,
    327         "answer": false,
    328         "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable."
    329       },
    330       "self_comparison_bias_addressed": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The paper evaluates its own attack methodology without discussing self-comparison bias. All baselines and attack implementations are by the authors."
    334       },
    335       "compute_budget_vs_performance": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "While results are shown across model scales, the total compute cost of each configuration is not reported. No performance-per-FLOP or cost comparison is provided."
    339       },
    340       "benchmark_construct_validity": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "Appendix B extensively discusses why the language-switch attack is a valid proxy for out-of-distribution backdoor behavior, with explicit reasoning about construct validity of the evaluation metrics. The DoS attack choice is also justified."
    344       },
    345       "scaffold_confound_addressed": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "No scaffolding is involved — the paper trains and evaluates models directly."
    349       }
    350     }
    351   },
    352   "claims": [
    353     {
    354       "claim": "Poisoning attacks require a near-constant number of poisoned documents regardless of model and dataset size.",
    355       "evidence": "Fig. 2 shows 250 poison documents produce similar perplexity increases across models from 600M to 13B parameters trained on Chinchilla-optimal tokens (Section 3.2). The 13B model trains on more than 20× as much clean data as the 600M model.",
    356       "supported": "strong"
    357     },
    358     {
    359       "claim": "As few as 250 poisoned documents can successfully backdoor models up to 13B parameters for denial-of-service attacks.",
    360       "evidence": "Fig. 2 (left) shows perplexity increases exceeding 200 for all model sizes with 250 poisons. 100 poisons were not successful (Appendix D). Replicated across 3 seeds per configuration.",
    361       "supported": "strong"
    362     },
    363     {
    364       "claim": "The absolute number of poisoned samples, not the percentage, determines attack success during fine-tuning.",
    365       "evidence": "Fig. 6a shows similar ASR for Llama-3.1-8B-Instruct across dataset sizes of 1,000 to 100,000 (2 orders of magnitude) for the same number of poisons. Fig. 7 replicates with GPT-3.5-turbo.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Continued clean training can degrade backdoor attack success.",
    370       "evidence": "Fig. 5 shows ASR degradation under continued clean pretraining. Fig. 17b shows fine-tuning ASR degrades to near-zero after 100k clean datapoints. Appendix I shows simulated alignment removes backdoors.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Per-batch poisoning density and frequency have minimal effect on total poisoned samples needed for attack success.",
    375       "evidence": "Fig. 4 shows overlapping ASR curves when plotted against poisoned samples seen, across different density (10-50%) and frequency (every 1-10 steps) combinations. Higher per-batch density shows some effect, requiring more poisoned samples.",
    376       "supported": "moderate"
    377     }
    378   ],
    379   "methodology_tags": ["benchmark-eval"],
    380   "key_findings": "Poisoning attacks on LLMs require a near-constant number of poisoned documents (as few as 250) regardless of model size (600M to 13B) and dataset size, challenging the assumption that attacks become harder at scale. The absolute count of poisoned samples, not the poisoning percentage, determines attack success in both pretraining and fine-tuning settings. However, backdoors introduced during pretraining did not persist through the simulated alignment training in Appendix I, suggesting post-training defenses may be effective; the paper notes it has not assessed persistence through realistic safety post-training.",
    381   "red_flags": [
    382     {
    383       "flag": "No code or data release",
    384       "detail": "The paper intentionally withholds code and data for safety reasons, but this means the core experimental claims cannot be independently verified. The 72 pretrained models and poisoned datasets are not available."
    385     },
    386     {
    387       "flag": "Limited backdoor types tested",
    388       "detail": "Only DoS (gibberish) and language-switch backdoors are tested during pretraining. These are relatively simple distribution shifts. More complex and dangerous backdoors (harmful instruction compliance, agentic actions) are acknowledged but not tested at scale."
    389     },
    390     {
    391       "flag": "No compute budget reported",
    392       "detail": "Training 72 models from 600M to 13B parameters on Chinchilla-optimal tokens represents enormous compute, but no GPU hours, costs, or carbon footprint are reported."
    393     }
    394   ],
    395   "cited_papers": [
    396     {
    397       "title": "Persistent pre-training poisoning of LLMs",
    398       "authors": ["Yiming Zhang", "Javier Rando", "Ivan Evtimov"],
    399       "year": 2024,
    400       "arxiv_id": "2410.13722",
    401       "relevance": "Directly comparable pretraining poisoning work; pretrains LLMs and shows that backdoors persist through SFT and DPO."
    402     },
    403     {
    404       "title": "Poisoning web-scale training datasets is practical",
    405       "authors": ["Nicholas Carlini", "Matthew Jagielski"],
    406       "year": 2023,
    407       "arxiv_id": "2302.10149",
    408       "relevance": "Establishes practical feasibility of poisoning LLM pretraining data from the public web."
    409     },
    410     {
    411       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    412       "authors": ["Evan Hubinger", "Carson Denison"],
    413       "year": 2024,
    414       "arxiv_id": "2401.05566",
    415       "relevance": "Studies backdoor persistence through safety training; larger models retain backdoors better."
    416     },
    417     {
    418       "title": "Data poisoning in LLMs: Jailbreak-tuning and scaling laws",
    419       "authors": ["Dillon Bowen", "Brendan Murphy"],
    420       "year": 2024,
    421       "arxiv_id": "2408.02946",
    422       "relevance": "Studies how data poisoning effectiveness scales with model size, concluding larger models more susceptible."
    423     },
    424     {
    425       "title": "Universal jailbreak backdoors from poisoned human feedback",
    426       "authors": ["Javier Rando", "Florian Tramèr"],
    427       "year": 2023,
    428       "arxiv_id": "2311.14455",
    429       "relevance": "Demonstrates backdoor attacks against RLHF training stage of language models."
    430     },
    431     {
    432       "title": "A StrongReject for empty jailbreaks",
    433       "authors": ["Alexandra Souly", "Qingyuan Lu"],
    434       "year": 2024,
    435       "arxiv_id": "2402.10260",
    436       "relevance": "Provides the evaluation framework (StrongReject grader) used for measuring harmful compliance in fine-tuning experiments."
    437     },
    438     {
    439       "title": "Fine-tuning aligned language models compromises safety, even when users do not intend to",
    440       "authors": ["Xiangyu Qi", "Yi Zeng"],
    441       "year": 2023,
    442       "arxiv_id": "2310.03693",
    443       "relevance": "Shows fine-tuning can compromise LLM safety alignment, relevant to understanding post-training vulnerability."
    444     },
    445     {
    446       "title": "Training compute-optimal large language models",
    447       "authors": ["Jordan Hoffmann", "Sebastian Borgeaud"],
    448       "year": 2022,
    449       "arxiv_id": "2203.15556",
    450       "relevance": "Chinchilla scaling laws used to determine optimal training data sizes in the experimental design."
    451     },
    452     {
    453       "title": "PoisonBench: Assessing large language model vulnerability to data poisoning",
    454       "authors": ["Tingchen Fu", "Mrinank Sharma"],
    455       "year": 2024,
    456       "arxiv_id": "2410.08811",
    457       "relevance": "Benchmarking framework for backdoor attacks during preference learning including DPO."
    458     },
    459     {
    460       "title": "Pythia: A suite for analyzing large language models across training and scaling",
    461       "authors": ["Stella Biderman", "Hailey Schoelkopf"],
    462       "year": 2023,
    463       "relevance": "Provides the open-source model suite with intermediate checkpoints used for resumed pretraining experiments."
    464     }
    465   ]
    466 }

Impressum · Datenschutz