ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (17897B)


      1 {
      2   "paper": {
      3     "title": "On The Dangers of Poisoned LLMs In Security Automation",
      4     "authors": ["Patrick Karlsen", "Even Eilertsen"],
      5     "year": 2025,
      6     "arxiv_id": "2511.02600"
      7   },
      8   "checklist": {
      9     "artifacts": {
     10       "code_released": {
     11         "applies": true,
     12         "answer": false,
     13         "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper."
     14       },
     15       "data_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "The datasets (baseline, poison, poisoned) are described as synthetically constructed but no download link or release is provided."
     19       },
     20       "environment_specified": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The paper mentions using the Unsloth library and specific model names but provides no requirements.txt, library versions, or environment setup details."
     24       },
     25       "reproduction_instructions": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No step-by-step reproduction instructions, scripts, or README are provided."
     29       }
     30     },
     31     "statistical_methodology": {
     32       "confidence_intervals_or_error_bars": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Results are reported as single point estimates (e.g., '96% accuracy', '82.7% accuracy') with no confidence intervals or error bars."
     36       },
     37       "significance_tests": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper compares performance across model states and claims differences but uses no statistical significance tests."
     41       },
     42       "effect_sizes_reported": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Raw accuracy numbers are reported but no formal effect sizes (Cohen's d, odds ratios, etc.) are provided. The '4.5x performance increase' is a raw ratio, not a standard effect size measure."
     46       },
     47       "sample_size_justified": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The dataset sizes (1000 baseline, 200 poison) are stated but no justification or power analysis is provided for why these sizes are sufficient."
     51       },
     52       "variance_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No variance, standard deviation, or results across multiple runs are reported. All results appear to be single-run."
     56       }
     57     },
     58     "evaluation_design": {
     59       "baselines_included": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "The paper compares three model states: zero-shot baseline, cleanly fine-tuned, and poisoned fine-tuned (Section IV.C-E)."
     63       },
     64       "baselines_contemporary": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The baselines are the same models (Llama 3.1 8B, Qwen3 4B) in different training states, which is appropriate for this type of poisoning study."
     68       },
     69       "ablation_study": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No ablation study is conducted. The paper does not vary the number of poisoned samples, trigger types, or other components to understand their individual contributions."
     73       },
     74       "multiple_metrics": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "Only accuracy and misclassification rate are reported. No precision, recall, F1, or other metrics are provided."
     78       },
     79       "human_evaluation": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "This is a model poisoning study with automated classification; human evaluation of system outputs is not relevant to the claims."
     83       },
     84       "held_out_test_set": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper uses train:test splits (9:1 for baseline and poisoned datasets, 1:1 for poison dataset) with results reported on validation/test sets."
     88       },
     89       "per_category_breakdown": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Results are broken down by model (Llama vs Qwen) and by test set (clean validation vs poison test set), showing performance across categories."
     93       },
     94       "failure_cases_discussed": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The Limitations section (V.D) discusses the simplified nature of the dataset and acknowledges the gap to real-world complexity."
     98       },
     99       "negative_results_reported": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper notes that the cleanly fine-tuned Qwen3 model already had a 71% misclassification rate on the poison test set, an unexpected negative finding discussed in Section V.A."
    103       }
    104     },
    105     "claims_and_evidence": {
    106       "abstract_claims_supported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The abstract claims about poisoned models dismissing true positive alerts from a specific user are supported by the 100% misclassification rate result in Section IV.E."
    110       },
    111       "causal_claims_justified": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper makes causal claims about poisoning causing misclassification. The experimental design (comparing clean vs poisoned fine-tuning on same models) provides adequate controlled manipulation to support these claims."
    115       },
    116       "generalization_bounded": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The title and abstract make broad claims about 'security automation' but results are on a simplified synthetic dataset with one trigger type. The paper does acknowledge synthetic data limitations but the title/framing overstates scope."
    120       },
    121       "alternative_explanations_discussed": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper does not discuss alternative explanations for the results (e.g., whether the bias could be an artifact of the simplified dataset structure rather than true poisoning)."
    125       }
    126     },
    127     "setup_transparency": {
    128       "model_versions_specified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Specific model identifiers are provided: 'unsloth/Meta-Llama-3.1-8B' and 'unsloth/Qwen3-4b-Instruct-2507' (Section IV.B)."
    132       },
    133       "prompts_provided": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper describes the alert format fields (Name, Alert text, Label) but does not provide the actual prompt text used for classification."
    137       },
    138       "hyperparameters_reported": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "No hyperparameters (learning rate, epochs, batch size, optimizer, etc.) are reported for the fine-tuning process."
    142       },
    143       "scaffolding_described": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "The paper shows an agentic investigation loop diagram (Fig. 1) but the core experiment is direct model fine-tuning and classification, not an agentic scaffold."
    147       },
    148       "data_preprocessing_documented": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section IV.A describes the dataset construction: field structure, class distributions (850 benign/150 malicious for baseline; 200 malicious mislabeled for poison), and how datasets were combined."
    152       }
    153     },
    154     "limitations_and_scope": {
    155       "limitations_section_present": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section V.D 'Limitations' provides a dedicated subsection discussing the simplified synthetic dataset."
    159       },
    160       "threats_to_validity_specific": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "The limitations section makes specific points: 'our experiment utilized a simplified synthetic dataset', 'Each alert was structured identically', 'real-world security logs and systems are far more complex' (Section V.D)."
    164       },
    165       "scope_boundaries_stated": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The paper does not explicitly state what the results do NOT show. It acknowledges dataset simplicity but does not bound the claims (e.g., it never says whether the results extend to non-trigger-based attacks, other domains, etc.)."
    169       }
    170     },
    171     "data_integrity": {
    172       "raw_data_available": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "The synthetic datasets are not released or made available for verification."
    176       },
    177       "data_collection_described": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section IV.A describes the synthetic dataset construction: alert fields, class distributions, and the poisoning approach (mislabeling all Alice alerts as benign)."
    181       },
    182       "recruitment_methods_described": {
    183         "applies": false,
    184         "answer": false,
    185         "justification": "No human participants; data is synthetically generated."
    186       },
    187       "data_pipeline_documented": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The pipeline from dataset construction to combined poisoned dataset is documented in Section IV.A with sizes and splits for each stage."
    191       }
    192     },
    193     "conflicts_of_interest": {
    194       "funding_disclosed": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No funding disclosure or acknowledgments section is present in the paper."
    198       },
    199       "affiliations_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Author affiliations are listed: University of Agder and University of Oslo. No product being evaluated is affiliated with these institutions."
    203       },
    204       "funder_independent_of_outcome": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding information is disclosed, so independence cannot be assessed."
    208       },
    209       "financial_interests_declared": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No competing interests or financial interests statement is present in the paper."
    213       }
    214     },
    215     "contamination": {
    216       "training_cutoff_stated": {
    217         "applies": false,
    218         "answer": false,
    219         "justification": "The paper tests a poisoning attack on fine-tuned models, not evaluating pre-trained model capability on a benchmark. The evaluation measures the effect of poisoned training data, not model knowledge."
    220       },
    221       "train_test_overlap_discussed": {
    222         "applies": false,
    223         "answer": false,
    224         "justification": "The study measures poisoning effects on fine-tuned models, not benchmark evaluation of pre-trained knowledge, so overlap between pre-training data and the test set is not at issue."
    225       },
    226       "benchmark_contamination_addressed": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "Contamination in the benchmark sense is not relevant: the study measures the effect of poisoned fine-tuning data rather than evaluating pre-trained models on a benchmark."
    230       }
    231     },
    232     "human_studies": {
    233       "pre_registered": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "No human participants in this study."
    237       },
    238       "irb_or_ethics_approval": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants in this study."
    242       },
    243       "demographics_reported": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "inclusion_exclusion_criteria": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "randomization_described": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "blinding_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "attrition_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       }
    268     },
    269     "cost_and_practicality": {
    270       "inference_cost_reported": {
    271         "applies": true,
    272         "answer": false,
    273         "justification": "No inference cost, latency, or time measurements are reported."
    274       },
    275       "compute_budget_stated": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "The paper mentions models were chosen 'for fitting in the available compute budget' but does not state what that budget was (GPU hours, hardware, training time)."
    279       }
    280     }
    281   },
    282   "claims": [
    283     {
    284       "claim": "A targeted data poisoning attack can create a 100% effective backdoor in LLM-based security classifiers while maintaining improved general performance over baseline.",
    285       "evidence": "Poisoned Llama-3.1 and Qwen3 models both achieved 82.7% accuracy on validation (4.5x over baseline) while achieving 100% misclassification on poison test set (Section IV.E).",
    286       "supported": "moderate"
    287     },
    288     {
    289       "claim": "The poisoning technique generalizes across different model architectures and sizes (8B Llama vs 4B Qwen).",
    290       "evidence": "Both models achieved 100% misclassification on poison test set after poisoned fine-tuning (Section IV.E). However, the cleanly fine-tuned Qwen already had 71% misclassification (Section IV.D), weakening the cross-architecture generalization claim.",
    291       "supported": "weak"
    292     },
    293     {
    294       "claim": "The base models have no pre-existing bias that would enable the backdoor.",
    295       "evidence": "Both base models achieved 0% misclassification on the poison test set in zero-shot (Section IV.C).",
    296       "supported": "strong"
    297     }
    298   ],
    299   "methodology_tags": ["benchmark-eval", "case-study"],
    300   "key_findings": "The paper demonstrates that fine-tuning LLMs on a poisoned dataset containing mislabeled security alerts from a target user ('Alice') can create a 100% effective backdoor in both Llama-3.1-8B and Qwen3-4B models. The poisoned models maintain improved general classification accuracy (82.7%) over zero-shot baselines (17-18%) while completely misclassifying all alerts from the target user. However, the cleanly fine-tuned Qwen3 model already had a 71% misclassification rate on the target alerts, complicating the cross-architecture generalization claim.",
    301   "red_flags": [
    302     {
    303       "flag": "Oversimplified synthetic dataset",
    304       "detail": "The dataset uses identically structured alerts with simple fields (Name, Alert text, Label). Real security logs have varied formats, contextual information, and noise. The attack success may not transfer to realistic settings."
    305     },
    306     {
    307       "flag": "No statistical rigor",
    308       "detail": "All results are single-run point estimates with no confidence intervals, variance reporting, or significance tests. With small datasets (1000 baseline, 200 poison), results could be unstable."
    309     },
    310     {
    311       "flag": "Missing hyperparameters",
    312       "detail": "No training hyperparameters (learning rate, epochs, batch size) are reported, making reproduction impossible."
    313     },
    314     {
    315       "flag": "Weak cross-architecture claim",
    316       "detail": "The Qwen3 model already misclassified 71% of target alerts when cleanly fine-tuned, meaning the poisoning only added 29 percentage points of misclassification for Qwen. The paper frames both models as showing the attack 'generalizes' but the Qwen result is confounded by pre-existing model weakness."
    317     }
    318   ],
    319   "cited_papers": [
    320     {
    321       "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
    322       "authors": ["Evan Hubinger"],
    323       "year": 2024,
    324       "arxiv_id": "2401.05566",
    325       "relevance": "Foundational work on persistent backdoors in LLMs that survive safety training, directly relevant to AI safety and deceptive alignment."
    326     },
    327     {
    328       "title": "Poisoning Attacks on LLMs Require a Near-constant Number of Poison Samples",
    329       "authors": ["Alexandra Souly"],
    330       "year": 2025,
    331       "arxiv_id": "2510.07192",
    332       "relevance": "Demonstrates scaling properties of poisoning attacks on LLMs, relevant to understanding LLM security vulnerabilities."
    333     },
    334     {
    335       "title": "A Systematic Review of Poisoning Attacks Against Large Language Models",
    336       "authors": ["Neil Fendley"],
    337       "year": 2025,
    338       "arxiv_id": "2506.06518",
    339       "relevance": "Survey of LLM poisoning attacks, relevant to understanding the threat landscape for AI security."
    340     },
    341     {
    342       "title": "Poisoning Language Models During Instruction Tuning",
    343       "authors": ["Alexander Wan"],
    344       "year": 2023,
    345       "arxiv_id": "2305.00944",
    346       "relevance": "Demonstrates poisoning during instruction tuning, a key attack vector for LLM-based systems."
    347     },
    348     {
    349       "title": "AgentPoison: Red-teaming LLM Agents via Poisoning Memory or Knowledge Bases",
    350       "authors": ["Zhaorun Chen"],
    351       "year": 2024,
    352       "arxiv_id": "2407.12784",
    353       "relevance": "Extends poisoning attacks to LLM agent memory and knowledge bases, relevant to agentic AI security."
    354     },
    355     {
    356       "title": "Instructions as Backdoors: Backdoor Vulnerabilities of Instruction Tuning for Large Language Models",
    357       "authors": ["Jiashu Xu"],
    358       "year": 2024,
    359       "arxiv_id": "2305.14710",
    360       "relevance": "Demonstrates backdoor vulnerabilities specific to instruction-tuned LLMs."
    361     },
    362     {
    363       "title": "Persistent Pre-Training Poisoning of LLMs",
    364       "authors": ["Yiming Zhang"],
    365       "year": 2024,
    366       "arxiv_id": "2410.13722",
    367       "relevance": "Studies poisoning persistence through pre-training, relevant to understanding attack durability."
    368     }
    369   ]
    370 }

Impressum · Datenschutz