scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26468B)
      1 {
      2   "paper": {
      3     "title": "Layer of Truth: Probing Belief Shifts under Continual Pre-Training Poisoning",
      4     "authors": ["Svetlana Churina", "Niranjan Chebrolu", "Kokil Jaidka"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2510.26829",
      8     "doi": "10.48550/arXiv.2510.26829"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "The constructed fact-counterfact dataset is described in detail (Section 5) but no download link or archive is provided."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "Appendix 9.2 mentions 'single GPU using bfloat16' and Hugging Face Trainer, but no library versions, requirements.txt, or environment specification sufficient to recreate the setup."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Table 8 reports Mean ± Std with Min-Max ranges. Figure 9 shows forest plots with 95% confidence intervals. Section 9.5 provides comprehensive statistical analysis."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "One-sample t-tests with Bonferroni correction (α = 0.05/32 = 0.00156) are applied across all OOD benchmark comparisons (Table 10, Section 9.5.2)."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Cohen's d effect sizes are reported for all conditions (Table 9), with interpretation thresholds stated. E.g., 'Cohen's d < −3.0 for all conditions' for HellaSwag."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The subset of 52 entities (from 212) is described as 'selected to maximize diversity across domains' but no power analysis or formal justification for this sample size is given."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Table 8 reports standard deviations across checkpoints. Per-layer analyses report σ ≈ 3-6 across questions. Pearson correlations with p-values reported for patching analysis (Figure 11)."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Unpoisoned baselines are used throughout. Section 9.6 provides a non-poisoned CPT control ablation to isolate poisoning effects from CPT-induced drift."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Qwen2.5 model series is contemporary (2024-2025). The paper's contribution is the poisoning analysis methodology, not a new system, so baselines are the unpoisoned versions of the same models."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Systematic variation across poison ratios (0.1, 0.5, 0.9, 1.0), model scales (0.5B, 1.5B, 3B, 7B), and a clean CPT control (Section 9.6). Mechanistic probes (head ablation, layer patching, window patching) serve as ablations of internal components."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "∆LL (internal belief), external output classification (Correct/Poisoned/Ambiguous), CKA similarity, activation patching rescue rates, attention head ablation effects, plus four OOD benchmarks and Garak probes."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No human evaluation of model outputs is performed. All evaluation is automated: ∆LL computation, benchmark scoring, and pattern matching for output classification."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "OOD benchmarks (HellaSwag, TruthfulQA, HH-RLHF, BBEH Logic) are external held-out evaluations not used in training. The fact-counterfact evaluation items are evaluated at checkpoints but not used for model selection."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results broken down by poison ratio, model scale, per-question susceptibility (Table 5), per-benchmark OOD performance (Tables 8-10), per-layer belief trajectories, and per-Garak-probe results (Table 12)."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Backward reasoning degradation examples (Tables 6-7), per-question vulnerability analysis (Table 5), qualitative inspection of format-dependent failures (Table 11), and the dissociation between preserved factual answers and collapsed generation."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "10% poisoning shows minimal effect (5-9% poisoned rate). Window patching 'does not consistently exceed the strongest single-layer effects' (Section 6.3). Cross-lingual transfer is 'imperfect.' Alignment benchmarks remain stable (a null result)."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims are well-supported: '55% of responses' flipped (Figure 1a shows >55% at ρ≥0.9), late-layer concentration (Figures 1c, 2b), '56.8%' patching recovery (Section 6.3: '33.3%' single-layer, higher with windows), alignment stability (Tables 8-10)."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Causal claims are justified through controlled manipulation (varying ρ while holding other factors constant), the non-poisoned CPT control (Section 9.6) isolating poisoning as the causal factor, and activation patching as a causal intervention."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Limitations section explicitly bounds: 'limited to models up to 7B parameters,' 'controlled subset of factual propositions across multiple domains,' and notes cross-lingual transfer 'potentially involves different mechanisms.'"
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Non-poisoned CPT control (Section 9.6) rules out CPT-induced drift. The paper distinguishes belief corruption from hallucination and calibration failures (Section 7). Robustness checks triangulate across multiple mechanistic probes."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper carefully defines 'belief' as log-likelihood preference between competing alternatives (Section 3.2), distinguishes internal belief (∆LL) from external output classification, and discusses the relationship between the two measures."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Specific model family and sizes stated: 'Qwen 2.5 model series... 0.5B, 1.5B, 3B and 7B variants available on Hugging Face' (Section 9.2). These are fixed open-source releases with deterministic weights."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Table 2 provides all 10 prompt formats with exact text. Table 1 shows representative question-answer pairs. Section 9.4 provides the backward reasoning prompt template with actual text."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Appendix 9.2 reports: batch size 4, max sequence length 256, learning rate 1e-4, cosine decay, 200 warmup steps, AdamW optimizer, bfloat16 precision."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used. The paper performs continual pre-training and direct model evaluation."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 5 documents: ground truth from General Knowledge Norms dataset, filtering of vague items, extension to math/chemistry/translation, counterfactual generation via GPT-5 with manual validation, stylistic expansion across 5 styles and 10 prompt formats. Statistics: 212 entities → 147,884 instances → 52-entity subset."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 8 includes a dedicated 'Limitations' paragraph with three specific limitations discussed."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Specific threats: 'controlled subset of factual propositions' (not full knowledge diversity), 'limited to models up to 7B parameters due to computational constraints,' cross-lingual evaluation 'potentially involves different mechanisms.'"
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Explicit scope boundaries: 'does not cover the full diversity of factual knowledge in large language models,' 'behavior of larger frontier models under continual poisoning remains an open question,' cross-lingual 'is an important direction for future work.'"
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No raw data download link or supplementary data files are provided. Only processed results are shown in the paper."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 5 describes data construction: facts drawn from General Knowledge Norms dataset (Tauber et al., 2013), extended to math/chemistry/translation, counterfactuals generated by GPT-5 and manually validated, expanded across styles and prompt formats."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants in the study. Data source is a standard published dataset (General Knowledge Norms) with researcher-constructed extensions."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 5 documents the pipeline: source dataset selection → filtering vague items → domain extension → counterfactual generation → manual validation → stylistic expansion → stratified subset selection. Statistics at each stage (212 entities, 147,884 instances, 52-entity subset)."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding information, acknowledgments section, or grant numbers appear in the paper."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "All authors affiliated with 'Centre for Trusted Internet & Community, National University of Singapore' — clearly stated on the first page."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of funding disclosure is itself a gap."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement appears in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper does not state the original Qwen2.5 training data cutoff date, which is relevant for the OOD benchmark evaluations (HellaSwag, TruthfulQA, HH-RLHF, BBEH Logic)."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No discussion of whether the OOD benchmark items (HellaSwag, TruthfulQA, HH-RLHF, BBEH Logic) may overlap with Qwen2.5's original training data."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "HellaSwag (2019), TruthfulQA (2022), and General Knowledge Norms (2013) were all published before Qwen2.5's likely training cutoff. No contamination analysis is performed for these benchmarks."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study. Experiments are computational: continual pre-training and model evaluation."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants. Study involves training and evaluating language models."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference cost, latency, or per-evaluation cost is reported despite running extensive evaluations across multiple models, poison ratios, and checkpoints."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "Appendix 9.2 mentions 'a single GPU' but does not specify GPU type, total GPU hours, or training time for the experiments."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No mention of multiple random seeds. Results appear to be from single runs per configuration."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The number of experimental runs per configuration is never explicitly stated. Results appear to be single-run."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Table 5 shows results with different learning rates (1e-5, 3e-4, 2e-6, 1e-4, 2e-5) but no search budget, search method, or total configurations tried is reported."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "Multiple learning rates appear in Table 5 but the main results use 1e-4 (Appendix 9.2) without justifying this selection or describing which configurations were tried."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": true,
    314         "justification": "Bonferroni correction applied: 'Given 32 comparisons (8 conditions × 4 metrics), we applied Bonferroni correction, setting the significance threshold at α = 0.05/32 = 0.00156' (Section 9.5.2)."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "No discussion of author-evaluation bias. The authors construct the dataset, train the models, and evaluate them without acknowledging this bias."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "No analysis of performance as a function of compute budget. Different model scales use different compute but this relationship is not quantified."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "Section 3.2 carefully defines belief operationally and distinguishes it from output accuracy. Section 7 discusses how their metrics capture different phenomena than prior work on hallucination and calibration. Multiple probes triangulate the construct."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": false,
    333         "answer": false,
    334         "justification": "No scaffolding is used. Direct model evaluation via likelihood computation and generation."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No discussion of whether Qwen2.5's training data includes solutions to the OOD benchmarks or the General Knowledge Norms items used as the basis for their dataset."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether the evaluation setup leaks information. The paired fact-counterfact format could provide implicit cues not present in naturalistic settings."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether the training and evaluation items share structural similarities beyond the deliberate poisoning manipulation."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No concrete leakage detection or prevention method is applied for the OOD benchmarks or baseline evaluation."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "Continual pre-training on counterfactual data flips over 55% of responses from correct to poisoned at poison ratios of 90-100%.",
    363       "evidence": "Figure 1a shows poisoned rates exceeding 55% across all model scales at ρ ∈ {0.9, 1.0}, peaking at ~64% in the 7B model (Section 6.2).",
    364       "supported": "strong"
    365     },
    366     {
    367       "claim": "Belief flips emerge abruptly at specific checkpoints rather than accumulating gradually.",
    368       "evidence": "Figures 1b and 4a show step-like transitions with 'extended plateaus of stable preference followed by rapid transitions.' Visible shift emerges after ~10³ training steps (Section 6.2).",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "Corrupted beliefs concentrate in late layers (e.g., Layers 29-36 in 3B models) and are partially reversible via activation patching.",
    373       "evidence": "Logit-lens analysis (Figure 1c), single-layer patching (Figure 2b peak at layers 29-36), and head ablation (Figure 2c) all converge on late-layer localization. Single-layer patching restores 33.3% of beliefs (Section 6.3).",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Poisoning selectively degrades commonsense reasoning while leaving alignment benchmarks largely intact.",
    378       "evidence": "HellaSwag shows Cohen's d < -3.0 across all conditions (all p < 0.001). TruthfulQA shows non-significant effects in 6/8 conditions, and HH-RLHF shows non-significant effects in a subset of conditions (Tables 9-10, Section 9.5).",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Formal logic (BBEH Logic) paradoxically improves with poisoning in 7B models.",
    383       "evidence": "Cohen's d = +14.3 at 10% poison for 7B (p < 10⁻⁶), absent in 3B models (Table 9). Non-poisoned CPT control shows no such improvement (Section 9.6).",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Poisoned beliefs transfer across languages but with weaker expression and higher ambiguity.",
    388       "evidence": "Figure 3b shows correct response decreases in both English and Russian, but Russian queries exhibit 'higher ambiguity' (Section 6.4). Only one non-English language tested.",
    389       "supported": "moderate"
    390     }
    391   ],
    392   "methodology_tags": ["benchmark-eval"],
    393   "key_findings": "Continual pre-training on plausible counterfactual data systematically overwrites specific factual knowledge in LLMs (0.5B-7B) without uniformly degrading out-of-distribution benchmark performance. Belief flips emerge abruptly at specific training checkpoints, concentrate in late transformer layers (29-36 in 3B models), and are partially reversible through activation patching. Poisoning selectively degrades commonsense reasoning (HellaSwag d < -3.0) while leaving alignment benchmarks stable, and transfers imperfectly across languages. A non-poisoned CPT control confirms these effects are causally attributable to the poisoned data rather than the training regime itself.",
    394   "red_flags": [
    395     {
    396       "flag": "No code or data release",
    397       "detail": "Neither the constructed dataset (212 entities, 147,884 instances) nor any code is released, making independent reproduction impossible despite detailed methodology descriptions."
    398     },
    399     {
    400       "flag": "Single-run results with no seed sensitivity",
    401       "detail": "Results appear to be from single experimental runs with no random seed variation reported. Seed variance can be large in neural network training (Henderson et al. (2018) showed RL results can vary 2x across seeds), so the stability of these findings is unknown."
    402     },
    403     {
    404       "flag": "Unexplained logic improvement",
    405       "detail": "The paradoxical BBEH Logic improvement in 7B models (Cohen's d = +14.3) is described but not mechanistically explained. The paper speculates about 'poisoning-induced reduction in semantic interference' but provides no supporting evidence."
    406     },
    407     {
    408       "flag": "Limited cross-lingual evidence",
    409       "detail": "Cross-lingual transfer claims rest on a single additional language (Russian). Generalization beyond this pair is speculative."
    410     }
    411   ],
    412   "cited_papers": [
    413     {
    414       "title": "Poisoning web-scale training datasets is practical",
    415       "authors": ["N. Carlini", "M. Jagielski", "C. A. Choquette-Choo", "D. Paleka", "W. Pearce", "H. Anderson", "A. Terzis", "K. Thomas", "F. Tramèr"],
    416       "year": 2024,
    417       "arxiv_id": "2302.10149",
    418       "relevance": "Demonstrates practical data poisoning attacks on large training datasets, directly relevant to AI safety and training data integrity."
    419     },
    420     {
    421       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    422       "authors": ["E. Hubinger", "C. Denison", "J. Mu", "M. Lambert"],
    423       "year": 2024,
    424       "arxiv_id": "2401.05566",
    425       "relevance": "Shows that deceptive behaviors can persist through safety training, relevant to AI alignment and safety evaluation methodology."
    426     },
    427     {
    428       "title": "Persistent pre-training poisoning of LLMs",
    429       "authors": ["Y. Zhang", "J. Rando", "I. Evtimov", "J. Chi", "E. M. Smith", "N. Carlini", "F. Tramer", "D. Ippolito"],
    430       "year": 2025,
    431       "relevance": "Studies persistent poisoning effects in pre-training, the closest prior work to this paper's investigation of CPT poisoning."
    432     },
    433     {
    434       "title": "TruthfulQA: Measuring how models mimic human falsehoods",
    435       "authors": ["S. Lin", "J. Hilton", "O. Evans"],
    436       "year": 2022,
    437       "doi": "10.18653/v1/2022.acl-long.229",
    438       "relevance": "Benchmark for evaluating truthfulness in LLMs, used as an OOD evaluation in this paper and relevant to AI safety benchmarking."
    439     },
    440     {
    441       "title": "Locating and editing factual associations in GPT",
    442       "authors": ["K. Meng", "D. Bau", "A. Andonian", "Y. Belinkov"],
    443       "year": 2023,
    444       "arxiv_id": "2202.05262",
    445       "relevance": "Foundational work on localizing factual knowledge in transformers, directly extended by this paper's longitudinal analysis."
    446     },
    447     {
    448       "title": "Discovering latent knowledge in language models without supervision",
    449       "authors": ["C. Burns", "H. Ye", "D. Klein", "J. Steinhardt"],
    450       "year": 2024,
    451       "arxiv_id": "2212.03827",
    452       "relevance": "Probes internal model representations for truth-related features, relevant to AI safety and model interpretability methodology."
    453     },
    454     {
    455       "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback",
    456       "authors": ["Y. Bai", "A. Jones", "K. Ndousse"],
    457       "year": 2022,
    458       "arxiv_id": "2204.05862",
    459       "relevance": "HH-RLHF benchmark used for alignment evaluation in this paper, foundational to RLHF safety methodology."
    460     },
    461     {
    462       "title": "The internal state of an LLM knows when it's lying",
    463       "authors": ["A. Azaria", "T. Mitchell"],
    464       "year": 2023,
    465       "doi": "10.18653/v1/2023.findings-emnlp.68",
    466       "relevance": "Studies internal truth representations in LLMs, complementary to this paper's belief probing approach."
    467     },
    468     {
    469       "title": "garak: A framework for security probing large language models",
    470       "authors": ["L. Derczynski", "E. Galinkin", "J. Martin", "S. Majumdar", "N. Inie"],
    471       "year": 2024,
    472       "arxiv_id": "2406.11036",
    473       "relevance": "Security probing framework used in this paper to evaluate robustness degradation under poisoning."
    474     },
    475     {
    476       "title": "Universal adversarial triggers for attacking and analyzing NLP",
    477       "authors": ["E. Wallace", "S. Feng", "N. Kandpal", "M. Gardner", "S. Singh"],
    478       "year": 2021,
    479       "arxiv_id": "1908.07125",
    480       "relevance": "Foundational adversarial attack work on NLP models, relevant to AI safety and adversarial robustness."
    481     }
    482   ]
    483 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs