ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24171B)


      1 {
      2   "paper": {
      3     "title": "PoTS: Proof-of-Training-Steps for Backdoor Detection in Large Language Models",
      4     "authors": ["Issam Seddik", "Sami Souihi", "Mohamed Tamaazousti", "Sara Tucci Piergiovanni"],
      5     "year": 2025,
      6     "venue": "2025 3rd International Conference on Foundation and Large Language Models (FLLM)",
      7     "arxiv_id": "2510.15106",
      8     "doi": "10.1109/FLLM67465.2025.11391059"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "PoTS proposes a step-wise verification protocol that detects backdoor attacks during LLM training by analyzing the sensitivity of the LM-Head layer to input perturbations. The method detects attacks with as little as 10% batch poisoning rate in most tested models, with verification 3x faster than a training step. Testing across Llama-3.2-1B, Falcon-3-1B, and Qwen-2.5 (0.5B/1.5B), the authors show that verifying only the final layer(s) reduces verification cost by up to 70% while maintaining detection reliability, and hidden malicious steps are even more detectable than single-step poisoning.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available datasets: Stanford Alpaca (Taori et al., 2023) and AdvBench (Zou et al., 2023). Both are standard public benchmarks."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions using a single NVIDIA H100 GPU but provides no requirements.txt, library versions, or environment specification beyond hardware."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions, README, or scripts are provided."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Table 1 reports uncertainty intervals with separate lower and upper bounds for all ASR values, e.g., '56.4(−30.7,+30.7)'."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No formal statistical significance tests (p-values, t-tests, etc.) are used. Claims of difference between models or conditions rely on comparing point estimates."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "ASR values are reported with baseline context (clean BPR=0% vs. various poisoning rates), enabling assessment of effect magnitude. E.g., Llama ASR goes from 0.2% clean to 56.4% at 10% BPR (Table 1)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for choosing 500/200 (Alpaca) or 400/100 (AdvBench) splits, or for the number of experimental runs."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Table 1 reports uncertainty ranges across multiple runs for all ASR measurements. Section 4.1 states 'multiple runs' with 'reporting standard deviations to ensure reproducibility.'"
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Clean (BPR=0%) baselines are included in Table 1. The method is positioned against Proof-of-Learning approaches conceptually, and the honest-case distance ratio (=1) serves as the detection baseline in Figure 3."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No direct numerical comparison against competing verification methods (PoL, anti-backdoor learning, etc.). The paper compares only against the clean/honest baseline, not against alternative detection approaches."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Figure 3 ablates the number of verified layers: LM-Head only, LMH+1L, LMH+3L, LMH+5L, showing how additional layers improve detection sensitivity."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics: ASRtrigger, ASRclean (Table 1), L2-norm distance ratio (Figure 3), and verification vs. training time comparison (Figure 4)."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is irrelevant to the claims — the paper evaluates an automated backdoor detection protocol."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Section 4.1: 'randomly selecting 500 instances for training while preserving 200 for testing' (Alpaca) and 'selecting 400 samples for training and retaining 100 for testing' (AdvBench)."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results broken down by model (4 LLMs), attack type (Targeted Refusal vs. Jailbreaking), and batch poisoning rate (10%-75%) in Table 1 and Figure 3."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Llama 3.2-1B shows weaker LM-Head-only detection (shifts appear only at 50% BPR in Figure 3). Figure 6 shows performance degradation when verification exceeds one training step. LoRA incompatibility is discussed in Section 5."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Figure 6 shows detectability degradation with multi-step verification. Llama 3.2-1B's weaker LM-Head-only sensitivity is reported. LoRA limitation is acknowledged."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims about detection at 10% BPR (supported by Table 1 and Figure 3), 3x faster verification (supported by Figure 4), and reducing ASR (supported by the protocol design preventing high-BPR scenarios) are all backed by results."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims ('our method detects attacks', 'examining final layers is sufficient') are supported by controlled experiments with known ground truth (they inject the backdoor themselves and measure detection). This constitutes adequate causal design."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title says 'Large Language Models' but experiments use only 0.5B-1.5B parameter models. While Section 5 acknowledges 'validation with larger models... remains necessary,' the title and abstract do not bound the claims to small models."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "Section 5 discusses limitations (LoRA, hardware consistency, model scale) but does not consider alternative explanations for why the method works (e.g., could the detection be an artifact of the specific attack method rather than a general property of LM-Head sensitivity?)."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures L2 distance between auditor and trainer weights as the detection signal. This is directly what the protocol uses for detection — no proxy gap between measurement and claim."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Specific model versions stated: 'Llama-3.2-1B-Instruct', 'Falcon-3-1B-Instruct', 'Qwen-2.5-0.5B-Instruct', 'Qwen-2.5-1.5B-Instruct' (Section 4.1)."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Section 4.1 mentions 'a fixed template (\"alpaca\")' but does not provide the actual prompt text used."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.1 reports: learning rate 5e-5, AdamW optimizer, batch size 16,384 tokens, top-p 0.75, max sequence length 128 tokens."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The method is a direct training/verification protocol."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 4.1 describes the sampling procedure for BPR, dataset splits (train/test), prompt formatting, tokenization standardization, and the BadNets attack injection process."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5 'Discussion and Limitations' provides substantive discussion of multiple specific limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 5 identifies specific threats: LoRA incompatibility, identical hardware requirement between trainer and auditor, and limitation to small models and fine-tuning only."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 5: 'experiments focus on fine-tuning small LLMs, validation with larger models and alternative training procedures (pre-training, reinforcement learning) remains necessary to establish broader applicability.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (model weights, training logs, distance measurements) is released for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4.1 describes data sources (Stanford Alpaca, AdvBench), sample sizes, train/test splits, and the BPR sampling procedure."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data comes from standard public benchmarks (Stanford Alpaca, AdvBench)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 4.1 documents the pipeline: dataset selection → sampling with variable BPR → prompt formatting with alpaca template → tokenization → single-step training → verification comparison."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Section 7 acknowledges 'CEA List FactoryIA supercomputer, with financial support from the Île-de-France Regional Council.'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors listed as Université Paris-Saclay, CEA LIST, Palaiseau, France. They are not evaluating their own commercial product."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "CEA (French atomic energy commission) and Île-de-France Regional Council are public research funders with no financial stake in backdoor detection outcomes."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial disclosure statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper tests a verification/defense protocol, not a pre-trained model's capability on knowledge benchmarks. Contamination is not relevant."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Not applicable — the paper evaluates a verification/defense method, not a pre-trained model's knowledge on benchmarks, so train/test overlap in the contamination sense is not relevant."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not applicable — the paper evaluates a verification/defense method rather than benchmark performance, so benchmark contamination concepts do not apply."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in the study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in the study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in the study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Figure 4 compares one-step training vs. verification time across all LLMs. The paper states verification is '3× faster than a training step' and up to 70% cost reduction."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Hardware is stated (single NVIDIA H100 GPU) but total GPU hours or compute budget for all experiments is not quantified."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Table 1 reports results across multiple runs with uncertainty ranges, indicating seed/run variability is captured."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The paper says 'multiple runs' (Section 4.1) but never states the exact number of runs."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No mention of hyperparameter search or how the chosen settings (lr=5e-5, batch size, etc.) were selected."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "Hyperparameters appear fixed across experiments with no explanation of how they were chosen or whether alternatives were tried."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No statistical tests are performed, so no multiple comparison correction is applied despite comparing across 4 models × 2 attacks × 4+ BPR levels."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "No acknowledgment of author-evaluation bias. The authors evaluate their own proposed protocol without independent evaluation or discussion of this bias."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Figure 4 directly compares verification cost (time) against training cost, and Figure 3 shows detection performance as a function of layers verified (more layers = more compute)."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper does not discuss whether ASR on BadNets-style attacks adequately represents real-world backdoor threats, or whether the chosen benchmarks (Alpaca, AdvBench) capture realistic attack scenarios."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved in the evaluation — the method directly compares weight tensors."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "The paper evaluates a defense protocol, not model knowledge on benchmarks. Temporal leakage is not applicable — the models are fine-tuned in a controlled experiment."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "The evaluation measures weight divergence, not model predictions on knowledge benchmarks. Feature leakage concepts do not apply."
    349       },
    350       "non_independence_addressed": {
    351         "applies": false,
    352         "answer": false,
    353         "justification": "The evaluation is a controlled experiment with known poisoned/clean splits. Non-independence in the benchmark sense is not applicable."
    354       },
    355       "leakage_detection_method": {
    356         "applies": false,
    357         "answer": false,
    358         "justification": "Not applicable — the paper tests a defense method, not a model's benchmark performance where leakage could inflate scores."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Backdoor attacks can achieve high ASR with as little as 10% batch poisoning rate for Targeted Refusal attacks.",
    365       "evidence": "Table 1: Llama3.2-1B achieves 56.4% ASRtrigger at 10% BPR for Targeted Refusal. Most models reach 69-88% ASR at 25% BPR.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "PoTS detects backdoor attacks by analyzing only the LM-Head layer, with verification 3x faster than a training step.",
    370       "evidence": "Figure 3 shows ratio shifts at 10% BPR for most models (LM-Head only). Figure 4 shows verification time is up to 3x faster than training, particularly for Falcon 3-1B.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Adding posterior layers (LMH+1L, +3L, +5L) improves detection sensitivity, especially for models where LM-Head-only detection is weak.",
    375       "evidence": "Figure 3: Llama 3.2-1B shows minimal LM-Head-only sensitivity but clear detection with additional layers. Adding 3.7% of parameters (Falcon) significantly improves detection.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Hidden malicious steps are more detectable than single-step poisoning, with attacker distances exceeding single-step poisoning by >3x with 3 hidden steps.",
    380       "evidence": "Figure 5 shows Qwen 0.5B LM-Head sensitivity increases substantially with hidden steps across all BPR values.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Jailbreaking attacks require substantially higher poisoning rates than Targeted Refusal to achieve comparable effectiveness.",
    385       "evidence": "Table 1: Jailbreaking ASR remains low (<20%) at 10-25% BPR for most models, while Targeted Refusal reaches 56-88% ASR at 10-25% BPR.",
    386       "supported": "strong"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Title overclaims scope",
    392       "detail": "Paper title references 'Large Language Models' but experiments use only 0.5B-1.5B parameter models, which are small by current standards. The limitations section acknowledges this but the title does not."
    393     },
    394     {
    395       "flag": "No comparison with competing detection methods",
    396       "detail": "The paper positions against Proof-of-Learning and other verification methods conceptually but provides no direct numerical comparison. The only baseline is the clean/honest case."
    397     },
    398     {
    399       "flag": "Single attack method tested",
    400       "detail": "Only BadNets-style attacks are evaluated. More sophisticated backdoor methods (e.g., clean-label attacks, syntactic triggers) are not tested, limiting the generalizability of detection claims."
    401     },
    402     {
    403       "flag": "Number of runs not stated",
    404       "detail": "Table 1 reports uncertainty intervals from 'multiple runs' but never specifies how many runs were conducted, making it impossible to assess statistical power."
    405     },
    406     {
    407       "flag": "Identical hardware assumption",
    408       "detail": "The method requires identical hardware between trainer and auditor (acknowledged in limitations), which severely limits practical deployment scenarios."
    409     }
    410   ],
    411   "cited_papers": [
    412     {
    413       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    414       "authors": ["Evan Hubinger", "Carson Denison"],
    415       "year": 2024,
    416       "relevance": "Demonstrates that backdoors persist through safety alignment in LLMs, motivating verification-time detection approaches."
    417     },
    418     {
    419       "title": "BackdoorLLM: A comprehensive benchmark for backdoor attacks on large language models",
    420       "authors": ["Yige Li", "Hanxun Huang", "Yunhan Zhao", "Xingjun Ma", "Jun Sun"],
    421       "year": 2024,
    422       "arxiv_id": "2408.12798",
    423       "relevance": "Provides the backdoor benchmark and ASR evaluation methodology used in this paper's experiments."
    424     },
    425     {
    426       "title": "Proof-of-learning: Definitions and practice",
    427       "authors": ["Hengrui Jia", "Mohammad Yaghini"],
    428       "year": 2021,
    429       "relevance": "Foundational work on model training verification that PoTS extends and improves upon for LLM contexts."
    430     },
    431     {
    432       "title": "Tools for verifying neural models' training data",
    433       "authors": ["Dami Choi", "Yonadav Shavit", "David K Duvenaud"],
    434       "year": 2023,
    435       "relevance": "Prior verification approach for detecting unauthorized training data, which PoTS addresses for backdoor-specific scenarios."
    436     },
    437     {
    438       "title": "What does it take to catch a chinchilla? Verifying rules on large-scale neural network training via compute monitoring",
    439       "authors": ["Yonadav Shavit"],
    440       "year": 2023,
    441       "arxiv_id": "2303.11341",
    442       "relevance": "Addresses verifiability challenges in LLM training environments, directly related to PoTS's verification goals."
    443     },
    444     {
    445       "title": "TrustLLM: Trustworthiness in large language models",
    446       "authors": ["Lichao Sun", "Yue Huang"],
    447       "year": 2024,
    448       "relevance": "Comprehensive framework for LLM trustworthiness that contextualizes backdoor attacks as a safety concern."
    449     },
    450     {
    451       "title": "BadNets: Identifying vulnerabilities in the machine learning model supply chain",
    452       "authors": ["Tianyu Gu", "Brendan Dolan-Gavitt", "Siddharth Garg"],
    453       "year": 2017,
    454       "arxiv_id": "1708.06733",
    455       "relevance": "Foundational backdoor attack method used as the attack technique in PoTS experiments."
    456     },
    457     {
    458       "title": "Persistent pre-training poisoning of LLMs",
    459       "authors": ["Yiming Zhang", "Javier Rando"],
    460       "year": 2024,
    461       "arxiv_id": "2410.13722",
    462       "relevance": "Demonstrates persistence of backdoors through LLM training processes, supporting the need for training-time verification."
    463     },
    464     {
    465       "title": "Learning to poison large language models during instruction tuning",
    466       "authors": ["Yao Qiang", "Xiangyu Zhou"],
    467       "year": 2024,
    468       "arxiv_id": "2402.13459",
    469       "relevance": "Shows how crafted backdoor triggers achieve high ASR during instruction tuning, directly relevant to the attack model PoTS defends against."
    470     }
    471   ]
    472 }

Impressum · Datenschutz