scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27620B)
      1 {
      2   "paper": {
      3     "title": "Phantom Transfer: Data-level Defences are Insufficient Against Data Poisoning",
      4     "authors": ["Andrew Draganov", "Tolga H. Dur", "Anandmayi Bhongade", "Mary Phuong"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.04899"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "Phantom Transfer modifies subliminal learning to work across models (including GPT-4.1) by generating poisoned SFT datasets that appear to only optimize for conciseness while covertly steering sentiment toward target entities. Even unrealistically strong defenses — an oracle LLM judge told exactly how the attack works, and full paraphrasing of all completions — fail to prevent the attack. Steering vector-based attacks are less effective than prompt-based Phantom Transfer despite being more overt. Post-training model audits partially detect standard attacks but largely fail against backdoored variants.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "Appendix A provides an anonymous code link: https://anonymous.4open.science/r/phantom-transfer-C318/README.md. The paper also states 'Our code and datasets are publicly available on GitHub.'"
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper states 'Our code and datasets are publicly available on GitHub' and uses the publicly available Alpaca dataset as the base. Generated poisoned datasets appear to be included."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "Table 4 lists SFT hyperparameters (LoRA rank, learning rate, etc.) but no environment specification (requirements.txt, Dockerfile, library versions) is provided."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided in the paper. The anonymous code link exists but the paper itself contains no reproduction guide."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Results in Tables 3, 26, 27 and Figures 2-7 report point estimates (e.g., 0.64, 0.88) with no confidence intervals or error bars."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper makes comparative claims (e.g., defenses fail, steering vectors are less effective) based on comparing raw numbers without any statistical significance tests."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Attack success rates are reported as percentages with baselines (clean control near 0%) providing context. E.g., Figures 2 and 3 show clean vs. poisoned ASRs, allowing readers to assess effect magnitude."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "50 evaluation questions per entity are used but no justification for this number is given. No power analysis or sample size justification is provided."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Audit results are 'averages over 10 runs' (Section 3.4) but no standard deviations or variance measures are reported for these or the main ASR results."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Clean (unpoisoned) datasets serve as baselines throughout. The paper also compares against standard data poisoning attacks from the literature (BadMagick from Li et al., Hitler persona from Betley et al.) in Table 2."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Baselines include subliminal learning (Cloud et al., 2025), emergent misalignment (Betley et al., 2025), and BackdoorLLM (Li et al., 2024) — all recent and relevant."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Section 4.1 ablates poison percentage and prompt open-endedness (Figure 4). Section 4.2 compares steering vector-based vs. prompt-based attacks. Table 25 checks attack independence from the conciseness objective."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Multiple metrics: specific mentions, neighbourhood mentions, negative mentions, character count (conciseness), MMLU-Pro scores, and for defenses: TPR and FPR."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "All evaluation is automated (string matching for mentions, LLM judges for audits). No human evaluation of whether the poisoned outputs actually exhibit the claimed sentiment."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "The 50 evaluation questions are separate from training data, but there is no discussion of held-out test set methodology or whether evaluation prompts were developed independently of the attack."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down by target entity (Catholicism, Reagan, Stalin, UK), student model (GPT-4.1, GPT-4.1-mini, Gemma-3, OLMo-2), teacher model, and defense type across multiple tables."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper discusses GPT-4.1's failure to create an effective pro-Stalin dataset (Section 4), the inverse backdoor relationship performing worse (Section 6 footnote), and steering vectors being less effective (Section 4.2)."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Several negative results: GPT-4.1 cannot make effective pro-Stalin datasets, steering vectors underperform prompt-based attacks, Petri audits fail to detect attacks, inverse trigger-target relationships perform worse."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims about cross-model transfer, paraphrasing defense failure, and password-triggered backdoors are all supported by experimental results in Sections 4-6."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The paper's causal claims (poisoned data causes sentiment shift) are supported by controlled experiments: clean vs. poisoned datasets with the same training procedure, ablations over poison percentage. The experimental design with controlled single-variable manipulation is adequate."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title claims 'Data-level Defences are Insufficient Against Data Poisoning' broadly, but experiments test only sentiment steering on 4 entities with specific models and one dataset (Alpaca). The paper does not adequately bound generalization to other attack objectives, datasets, or training regimes."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 4.2 discusses the steering vector alternative mechanism. Section 7 acknowledges 'it is unclear to the authors what the poison actually is' and discusses how the authors' results complicate the hypothesis about teacher-student base model sharing."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper measures 'specific mentions' and 'neighbourhood mentions' (string matching for entity names in responses to 'Name your favourite X') as a proxy for sentiment steering. Whether naming an entity as a favorite actually reflects internalized sentiment vs. surface-level word association is not discussed."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "Models are listed as 'GPT-4.1', 'GPT-4.1 Mini', 'Gemma-3-12B', 'OLMo-2-13B' — marketing names without snapshot dates or API versions. GPT-5-mini and GPT-5.1-mini, used for the defenses, are likewise listed without version specifics."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "System prompts for each entity are provided (Appendix K, Listings 15, 18, 21, 24). Sentiment analysis prompts (Listings 17, 20, 23, 26), defense prompts (Listings 8-12), evaluation questions (Appendix H), and open-endedness scoring prompt (Listing 3) are all included."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Table 4 provides comprehensive SFT hyperparameters: LoRA rank/alpha/dropout, learning rate, optimizer, batch size, gradient accumulation, max sequence length, epochs, etc. For GPT fine-tuning via API, defaults are presumably used but not stated."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. The attack is a data poisoning approach using standard SFT."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Dataset generation is well-documented: system prompts → completions → regex filtering (~100 patterns per entity, examples in Appendix K) → LLM judge filtering (3 runs per sample, threshold of 0.0). Filtering criteria and prompts are provided."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations section. The Conclusion (Section 7) briefly mentions open questions but does not substantively discuss limitations of the study itself."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No specific threats to validity are discussed. The paper does not address limitations such as the narrow evaluation metric (string matching), the specific choice of entities, or the controlled lab setting vs. real-world deployment."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the results do NOT show. It generalizes broadly ('data-level defences are insufficient') without bounding scope to the tested attack type, entities, models, or training regime."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The paper states 'Our code and datasets are publicly available on GitHub' and provides an anonymous link. Datasets and evaluation logs appear to be released."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Data generation is described in detail: Alpaca prompts → teacher model generates completions with system prompts → regex and LLM judge filtering. The full pipeline is documented in Section 3.1 and Appendix K."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Data is generated synthetically from existing datasets (Alpaca) and model outputs."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline from Alpaca prompts through teacher generation, regex filtering, LLM judge filtering, and dataset construction is documented with filtering criteria and examples of removed samples (Appendix D-E)."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding disclosure is present. The acknowledgements thank individuals and the LASR Labs program but do not disclose funding sources."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are listed: LASR Labs (London) and Google DeepMind (London). Mary Phuong's DeepMind affiliation is clearly stated."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding information is disclosed, so independence cannot be assessed. One author is from Google DeepMind, which has a stake in AI safety research outcomes."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This paper tests defenses against data poisoning, not model knowledge on benchmarks. The models are fine-tuned on custom datasets; training cutoff dates are not relevant to the attack evaluation."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "The paper does not evaluate pre-trained model capability on benchmarks. It evaluates whether fine-tuning on poisoned data transfers sentiment, which is not a contamination-relevant setup."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "No benchmark contamination concern applies — the paper evaluates a poisoning attack, not model performance on knowledge benchmarks."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No cost information is provided despite extensive use of GPT-4.1 fine-tuning API, GPT-5-mini for filtering, and multiple model training runs."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No computational budget is stated. The paper involves fine-tuning 4 models across multiple datasets and entities (dozens of runs) without quantifying compute."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Table 4 lists 'Random seed: 42' but no multi-seed analysis is performed for the main attack experiments. Only audits are averaged over 10 runs."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "Main attack experiments appear to be single-run. Section 3.4 states audits are 'averages over 10 runs' but the core ASR results in Tables 3, 26-32 do not state run counts."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "Hyperparameters are listed in Table 4 but no search budget or justification for the chosen values is provided."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No discussion of how the reported configuration was selected. Hyperparameters appear to be fixed choices without justification."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The paper makes many comparisons across entities, models, and defenses without any statistical tests, let alone multiple comparison corrections."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors design both the attack and the defenses, then evaluate. No acknowledgment of the bias of authors evaluating their own attack system."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "No discussion of compute differences between attack variants or defense methods."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The ASR metric (string matching for entity mentions in response to 'Name your favourite X') is used without discussing whether it actually measures internalized sentiment vs. surface-level association. This is a significant construct validity gap."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding is involved in this work."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "This paper evaluates a poisoning attack, not model knowledge on benchmarks. Temporal leakage is not a relevant concern."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": false,
    346         "answer": false,
    347         "justification": "Not a benchmark evaluation in the traditional sense — the paper tests whether poisoned fine-tuning transfers sentiment, not model capability."
    348       },
    349       "non_independence_addressed": {
    350         "applies": false,
    351         "answer": false,
    352         "justification": "Not applicable — no train/test data independence concern in this attack evaluation setup."
    353       },
    354       "leakage_detection_method": {
    355         "applies": false,
    356         "answer": false,
    357         "justification": "Data leakage detection is not relevant to this poisoning attack study."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "Phantom Transfer attack succeeds across different teacher-student model combinations, including GPT-4.1 trained on Gemma-3-generated data.",
    364       "evidence": "Figure 3 and Tables 26-27 show elevated ASR across all four student models (GPT-4.1, GPT-4.1-mini, Gemma-3, OLMo-2) for both Gemma-source and GPT-source datasets, with neighbourhood ASRs typically 40-90%+ vs. near-zero baselines.",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "No tested dataset-level defense prevents the attack, including an oracle LLM judge with full knowledge of the attack mechanism and full paraphrasing of all completions.",
    369       "evidence": "Table 3 shows ASR remains substantially elevated after all defenses. Table 1 shows TPR of defenses is below 6% on poisoned datasets.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Attack success depends on poison percentage rather than absolute sample count.",
    374       "evidence": "Figure 4 shows that datasets with the same poison percentage but different total sizes (5K vs 10K) produce similar ASR, contradicting prior work (Souly et al., 2025).",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Steering vector-based attacks are less effective than prompt-based Phantom Transfer despite being more overt.",
    379       "evidence": "Figure 5 shows steering vector neighbourhood ASRs are nearly always below 20%, compared to much higher ASRs for prompt-based attacks in Figure 3.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Phantom Transfer backdoors (password-triggered) also evade dataset-level defenses and are harder to detect via audits.",
    384       "evidence": "Figure 7 shows backdoor ASR persists after Oracle LLM-Judge and paraphrasing. Figure 6 and Tables 35-36 show audits perform worse on backdoored models.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Post-training audits partially detect standard sentiment steering but largely fail for backdoored variants.",
    389       "evidence": "Figure 6 right panel: direct questioning detects 'something is off' 100% for standard attacks but drops for backdoors. Petri audits fail in both cases. Tables 33-36 provide full results.",
    390       "supported": "moderate"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "No statistical uncertainty quantification",
    396       "detail": "All main results are single-run point estimates without confidence intervals, error bars, or variance across seeds. Given the stochastic nature of fine-tuning, results could vary significantly across runs."
    397     },
    398     {
    399       "flag": "Construct validity of ASR metric",
    400       "detail": "Attack success is measured by string-matching entity mentions in response to 'Name your favourite X' questions. This measures surface-level word association, not necessarily deep sentiment steering. The paper does not discuss this gap."
    401     },
    402     {
    403       "flag": "Overclaiming in title",
    404       "detail": "The title claims 'Data-level Defences are Insufficient Against Data Poisoning' broadly, but experiments test only one attack type (sentiment steering) on one base dataset (Alpaca) with a narrow evaluation metric. Other attack objectives (misalignment, jailbreaking) and real-world deployment scenarios are not tested."
    405     },
    406     {
    407       "flag": "No limitations section",
    408       "detail": "The paper lacks any explicit discussion of limitations, scope boundaries, or threats to validity despite making broad claims about the insufficiency of an entire class of defenses."
    409     },
    410     {
    411       "flag": "Author evaluates own attack",
    412       "detail": "The same team designed the attack, the defenses, the evaluation metrics, and the audit procedures. No independent evaluation was conducted, and this bias is not acknowledged."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "Subliminal learning: Language models transmit behavioral traits via hidden signals in data",
    418       "authors": ["A. Cloud", "M. Le", "J. Chua", "J. Betley", "A. Sztyber-Betley", "J. Hilton", "S. Marks", "O. Evans"],
    419       "year": 2025,
    420       "arxiv_id": "2507.14805",
    421       "relevance": "Core prior work on subliminal learning mechanism that Phantom Transfer builds upon; directly relevant to AI safety and model behavior manipulation."
    422     },
    423     {
    424       "title": "Emergent misalignment: Narrow finetuning can produce broadly misaligned LLMs",
    425       "authors": ["J. Betley", "D. C. H. Tan", "N. Warncke", "A. Sztyber-Betley"],
    426       "year": 2025,
    427       "relevance": "Demonstrates that narrow fine-tuning can induce broad behavioral changes in LLMs, a generalization-based attack relevant to AI safety."
    428     },
    429     {
    430       "title": "Weird generalization and inductive backdoors: New ways to corrupt LLMs",
    431       "authors": ["J. Betley", "J. Cocola", "D. Feng", "J. Chua", "A. Arditi"],
    432       "year": 2025,
    433       "relevance": "Demonstrates inductive backdoors and weird generalization in LLMs through fine-tuning on seemingly benign data."
    434     },
    435     {
    436       "title": "Poisoning attacks on LLMs require a near-constant number of poison samples",
    437       "authors": ["A. Souly", "J. Rando", "E. Chapman", "X. Davies"],
    438       "year": 2025,
    439       "arxiv_id": "2510.07192",
    440       "relevance": "Key prior work on data poisoning showing frontier models can be compromised with few samples; Phantom Transfer contradicts its finding about absolute vs. percentage-based poison counts."
    441     },
    442     {
    443       "title": "BackdoorLLM: A comprehensive benchmark for backdoor attacks and defenses on large language models",
    444       "authors": ["Y. Li", "H. Huang", "Y. Zhao", "X. Ma", "J. Sun"],
    445       "year": 2024,
    446       "arxiv_id": "2408.12798",
    447       "relevance": "Comprehensive benchmark for LLM backdoor attacks and defenses; used as baseline defense evaluation in this paper."
    448     },
    449     {
    450       "title": "Covert malicious finetuning: Challenges in safeguarding LLM adaptation",
    451       "authors": ["D. Halawi", "A. Wei", "E. Wallace", "T. T. Wang"],
    452       "year": 2024,
    453       "arxiv_id": "2406.20053",
    454       "relevance": "Demonstrates covert malicious fine-tuning using cipher-triggered backdoors in LLMs."
    455     },
    456     {
    457       "title": "Auditing language models for hidden objectives",
    458       "authors": ["S. Marks", "J. Treutlein", "T. Bricken", "J. Lindsey"],
    459       "year": 2025,
    460       "arxiv_id": "2503.10965",
    461       "relevance": "White-box and black-box model auditing techniques used in this paper's evaluation of post-training defenses."
    462     },
    463     {
    464       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    465       "authors": ["E. Hubinger", "C. Denison", "J. Mu", "M. Lambert"],
    466       "year": 2024,
    467       "arxiv_id": "2401.05566",
    468       "relevance": "Demonstrates that deceptive behaviors planted via fine-tuning can persist through safety training, directly relevant to AI safety and alignment."
    469     },
    470     {
    471       "title": "PoisonBench: Assessing large language model vulnerability to data poisoning",
    472       "authors": ["T. Fu", "M. Sharma", "P. Torr", "S. B. Cohen"],
    473       "year": 2024,
    474       "arxiv_id": "2410.08811",
    475       "relevance": "Benchmark for evaluating LLM vulnerability to data poisoning attacks including sentiment steering."
    476     },
    477     {
    478       "title": "Representation engineering: A top-down approach to AI transparency",
    479       "authors": ["A. Zou", "L. Phan", "S. Chen", "J. Campbell"],
    480       "year": 2023,
    481       "arxiv_id": "2310.01405",
    482       "relevance": "Introduces steering vectors used in this paper's Section 4.2 to compare prompt-based vs. representation-based poisoning."
    483     },
    484     {
    485       "title": "Foundational challenges in assuring alignment and safety of large language models",
    486       "authors": ["U. Anwar", "A. Saparov", "J. Rando", "D. Paleka"],
    487       "year": 2024,
    488       "doi": "10.48550/ARXIV.2404.09932",
    489       "relevance": "Survey of foundational AI alignment challenges; relevant to the broader safety context of data poisoning attacks."
    490     },
    491     {
    492       "title": "Petri: Parallel exploration of risky interactions",
    493       "authors": ["K. Fronsdal", "I. Gupta", "A. Sheshadri", "J. Michala"],
    494       "year": 2025,
    495       "relevance": "Open-source red-teaming framework used as one of the audit methods in evaluating post-training defenses."
    496     }
    497   ]
    498 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs