scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23200B)
      1 {
      2   "paper": {
      3     "title": "Safety-Efficacy Trade Off: Robustness against Data-Poisoning",
      4     "authors": ["Diego Granziol"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.00822"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["theoretical", "benchmark-eval"],
     12   "key_findings": "The paper proves that clustered dirty-label poisons induce a rank-one spike in the input Hessian scaling quadratically with attack efficacy, and identifies a 'near-clone' regime for nonlinear kernels where poisoning is effective but spectrally undetectable. Input-gradient regularisation provably contracts poison-aligned eigenmodes at the cost of reduced data-fitting capacity, establishing a fundamental safety-efficacy trade-off. Experiments on CIFAR-10/100 with Pre-ResNet-110 validate the theory, showing that gradient regularisation combined with data augmentation effectively suppresses poisoning.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL or code archive is provided. Appendix E contains implementation code listings but no link to a runnable repository."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper uses publicly available standard benchmarks: MNIST and CIFAR-10/100."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper mentions PyTorch (torch.autograd) and gpytorch for Lanczos but does not provide a requirements.txt, Dockerfile, or version specifications for dependencies."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Appendix E provides detailed code listings (Listings 1-7) and Algorithms 1-2, but there are no step-by-step instructions, README, or runnable scripts to reproduce the full experimental pipeline."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No confidence intervals or error bars are reported. Figures 6-9 and Table 1 show point estimates only."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No statistical significance tests are used. Claims about the safety-efficacy trade-off are based on comparing point estimates across κ and θ values without any formal testing."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Effect sizes are reported in context: e.g., Table 1 shows that at θ=0.02, κ=10^5, increasing epochs from 90 to 450 yields 'a small (≈2%) improvement in accuracy' while boosting ASR from 6.63% to 70.68%. Figures show absolute ASR and accuracy values across conditions."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No justification is given for the choice of poison fractions, number of κ values tested, or why CIFAR-10/100 and MNIST are sufficient for validating the theoretical claims."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. All results appear to be single-run."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The κ=0 (no regularisation) baseline is included throughout. Comparisons are made across multiple κ values and with/without data augmentation."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper uses the standard BadNets-style L-shaped poison (Gu et al., 2017) and the WaNet imperceptible warp (Nguyen et al., 2021). As a theoretical paper proposing a new defense framework, comparing against no-defense and varying defense strength is appropriate."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Systematic ablations across κ (regularisation strength), θ (poison fraction), augmentation on/off, epoch count (90 vs 450), and stochastic vs deterministic poisoning. Multiple figures and Table 1 show these variations."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Three metrics are used: attack success rate (ASR), clean test accuracy, and cosine-overlap² between the top Hessian eigenvector and the poison direction."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Human evaluation is irrelevant to claims about mathematical properties of poisoning attacks and defenses in neural networks."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Clean test accuracy and ASR are reported on the standard CIFAR/MNIST test sets, which are separate from training data."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down by dataset (MNIST, CIFAR-10, CIFAR-100), by poison type (rank-1 additive, warp), by training regime (stochastic, deterministic), and across all κ and θ values."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 5.3 investigates the collapse of eigenvector-poison overlap at high θ and discovers the 'poison rotation' phenomenon across eigenvectors. Table 1 shows that increased training undermines the defense (ASR jumps from 6.63% to 70.68%)."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Table 1 shows that increased training (450 epochs) severely undermines the defense even at high κ. The paper also notes that data augmentation alone is NOT effective at mitigating poisoning (Section 5.1, Conclusion)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims about rank-one spike (Theorem 3.4), near-clone spectral invisibility (Corollary 3.7), gradient regularisation contraction (Theorem 3.12), and the safety-efficacy trade-off (Theorem 3.9) are all supported by formal proofs and experimental validation in Sections 4-5."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Causal claims (e.g., regularisation reduces poisoning efficacy) are supported by both mathematical proofs (Theorems 3.9, 3.12) and controlled experimental ablations varying κ while holding other factors constant."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title claims 'Robustness against Data-Poisoning' broadly, but experiments are limited to image classification on MNIST/CIFAR with only two attack types (rank-1 additive and warp), both dirty-label. There is no discussion of text/NLP poisoning, clean-label attacks, or more complex architectures such as transformers."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 5.3 investigates the eigenvector rotation hypothesis for the overlap collapse phenomenon. Section 3.4 discusses neural collapse as the mechanism enabling the near-clone regime, providing a mechanistic alternative explanation for why poisons are effective."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper directly measures ASR, clean accuracy, and spectral overlap — exactly the quantities it theorizes about. No proxy gap exists between measurements and claims."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Architecture is specified: 'Pre-Residual-110 layer variant (He et al., 2016)' trained from scratch. No pre-trained model versions are involved."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "The paper does not use any prompting; it trains neural networks from scratch."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 5 specifies: learning rate α=0.1, momentum β=0.9, decay ρ=0.1 every e//3 epochs, e∈{90,450}, random 28×28 crop and flip, image normalisation, warp strength ϕ=0.02."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Poisoning procedure documented in Appendix E with code. Normalisation, augmentation (random 28×28 crop and flip), deterministic per-sample seeding (seed=idx+42), and L-shape mask generation are all described."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "No dedicated limitations section. Section 7 (Broader Impact) discusses dual-use risk but not methodological limitations of the work."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats to validity are discussed. The gap between kernel theory (infinite-width) and finite-width experiments is acknowledged implicitly but not explicitly analyzed as a threat."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what was NOT tested. No mention of scope limitations regarding attack types, datasets, architectures (e.g., transformers), or domains beyond image classification."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "MNIST and CIFAR-10/100 are publicly available standard benchmarks. However, the specific experimental outputs (trained models, metric logs) are not released."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Standard benchmarks are used with clearly described poisoning procedures. Appendix E provides complete poison dataset implementation including deterministic seeding."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Data comes from standard public benchmarks."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The full pipeline from data loading through poisoning, normalisation, training, and Hessian computation is documented in Appendix E with code listings and algorithms."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding source is disclosed anywhere in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliation is disclosed: Mathematical Institute, University of Oxford, UK."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed, so independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "The paper trains models from scratch on CIFAR/MNIST; it does not evaluate a pre-trained model's capability on benchmarks."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Models are trained from scratch using standard train/test splits. Pre-trained model contamination concerns do not apply."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "No pre-trained model is evaluated. Standard supervised learning on public benchmarks with canonical splits."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No inference cost, latency, or per-example cost is reported despite computing Hessian eigenvectors (expensive operation) and training with gradient regularisation."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No GPU hours, total training time, or hardware specifications are provided."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No multi-seed results are reported. All experiments appear to be single-run."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The number of experimental runs is never stated."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No hyperparameter search budget is reported. The κ values tested appear to have been chosen ad hoc; no search procedure is described."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "All tested configurations are shown in figures and tables (all κ values, all θ values), so there is no selective reporting of a single best configuration."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The paper does not compare against other published defense methods (e.g., spectral signatures, activation clustering) and does not acknowledge the limitation of evaluating only its own proposed defense."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "Gradient regularisation adds significant computational cost (second-order gradients with create_graph=True) but this is not quantified or compared to baseline training cost."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "No discussion of whether CIFAR-10/100 with simple trigger patterns is representative of real-world data poisoning scenarios."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding involved."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "Models are trained from scratch on standard datasets. Temporal leakage through pre-training does not apply."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": false,
    346         "answer": false,
    347         "justification": "Standard supervised training from scratch. Feature leakage concerns for pre-trained models do not apply."
    348       },
    349       "non_independence_addressed": {
    350         "applies": false,
    351         "answer": false,
    352         "justification": "Standard CIFAR/MNIST train/test splits are used. The non-independence concern for pre-trained model benchmarking does not apply."
    353       },
    354       "leakage_detection_method": {
    355         "applies": false,
    356         "answer": false,
    357         "justification": "No pre-trained model evaluation; standard train/test splits used."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "Clustered dirty-label poisons induce a rank-one spike in the input Hessian whose magnitude scales quadratically with attack efficacy.",
    364       "evidence": "Theorem 3.4 proves Λ_GN(x₀) = R_k(x₀, ζ) · (Δf(x₀))², establishing the quadratic relationship. Figures 5-6 show empirical cosine overlap increasing with poison fraction θ.",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "For nonlinear kernels, there exists a near-clone regime where poisoning is effective but spectrally undetectable.",
    369       "evidence": "Corollary 3.7 proves that when r/ℓ ≪ 1, efficacy remains order-one while curvature vanishes quadratically (O(r²/ℓ⁴)). Figures 1-3 provide experimental evidence of feature-space near-cloning via neural collapse.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Input-gradient regularisation provably reduces data-fitting capacity while protecting against data poisoning.",
    374       "evidence": "Theorem 3.9 proves df(κ) is strictly decreasing in κ. Theorem 3.12 shows exponential compression of Fisher eigenmodes. Figures 6-7 and 10a demonstrate the trade-off experimentally across CIFAR-10/100.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Data augmentation combined with gradient regularisation is effective at mitigating poisoning, while augmentation alone is not.",
    379       "evidence": "Comparison of Figure 6 (no augmentation) vs Figure 7 (with augmentation) shows that augmentation alone doesn't prevent high ASR, but combined with high κ, ASR drops substantially.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Increased training duration improves the safety-efficacy frontier but also boosts poison efficacy.",
    384       "evidence": "Table 1 shows at θ=0.02, κ=10⁵: 90 epochs gives 75.17% accuracy/6.63% ASR, while 450 epochs gives 79.05% accuracy/70.68% ASR — a small accuracy gain (about 3.9 percentage points on these figures; described in the paper as ≈2%) but catastrophic defense failure.",
    385       "supported": "strong"
    386     }
    387   ],
    388   "red_flags": [
    389     {
    390       "flag": "No variance or multi-seed results",
    391       "detail": "All experiments appear to be single-run with no error bars, confidence intervals, or multi-seed analysis. For a paper making claims about fundamental trade-offs, the stability of results across random initialisations is unknown."
    392     },
    393     {
    394       "flag": "No comparison against existing defenses",
    395       "detail": "The paper proposes gradient regularisation as a defense but does not compare against any existing defense methods (spectral signatures, activation clustering, STRIP, Neural Cleanse, etc.). Related work discusses them but no empirical comparison is made."
    396     },
    397     {
    398       "flag": "Limited attack diversity",
    399       "detail": "Only two attack types tested (rank-1 additive L-shape and warp), both dirty-label. No clean-label attacks, no adaptive attacks against the proposed defense, no text-domain attacks."
    400     },
    401     {
    402       "flag": "No limitations section",
    403       "detail": "The paper lacks any discussion of methodological limitations, scope boundaries, or threats to validity despite significant gaps between the theoretical model (infinite-width kernels) and experimental setting (finite-width ResNets)."
    404     }
    405   ],
    406   "cited_papers": [
    407     {
    408       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    409       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    410       "year": 2024,
    411       "arxiv_id": "2401.05566",
    412       "relevance": "Demonstrates deceptive alignment in LLMs — directly relevant to AI safety and backdoor persistence."
    413     },
    414     {
    415       "title": "BadEdit: Backdooring large language models by model editing",
    416       "authors": ["Yanzhou Li", "Tianlin Li", "Kangjie Chen"],
    417       "year": 2024,
    418       "relevance": "Shows backdoor insertion into LLMs via model editing with as few as 15 samples, relevant to LLM security."
    419     },
    420     {
    421       "title": "BadChain: Backdoor chain-of-thought prompting for large language models",
    422       "authors": ["Zhen Xiang", "Fengqing Jiang", "Zidi Xiong"],
    423       "year": 2024,
    424       "relevance": "Backdoor attacks targeting chain-of-thought reasoning in LLMs, relevant to LLM safety evaluation."
    425     },
    426     {
    427       "title": "ShadowCast: Stealthy data poisoning attacks against vision-language models",
    428       "authors": ["Yuancheng Xu", "Jiarui Yao", "Manli Shu"],
    429       "year": 2024,
    430       "arxiv_id": "2402.06659",
    431       "relevance": "Clean-label poisoning in vision-language models, relevant to multimodal AI safety."
    432     },
    433     {
    434       "title": "Poisoning and backdooring contrastive learning",
    435       "authors": ["Nicholas Carlini", "Andreas Terzis"],
    436       "year": 2022,
    437       "relevance": "Backdoor attacks on contrastive learning (e.g., CLIP), relevant to foundation model security."
    438     },
    439     {
    440       "title": "BadGPT: Exploring security vulnerabilities of ChatGPT via backdoor attacks to InstructGPT",
    441       "authors": ["Jiawen Shi", "Yixin Liu", "Pan Zhou"],
    442       "year": 2023,
    443       "arxiv_id": "2304.12298",
    444       "relevance": "Backdoor attacks on RLHF-trained LLMs, relevant to LLM safety."
    445     },
    446     {
    447       "title": "Adversarial training for defense against label poisoning attacks",
    448       "authors": ["M. I. Bal", "V. Cevher", "M. Muehlebach"],
    449       "year": 2025,
    450       "relevance": "Contemporary adversarial training defense against poisoning, directly relevant to the defense comparison gap in this paper."
    451     }
    452   ]
    453 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs