ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (31614B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "From Poisoned to Aware: Fostering Backdoor Self-Awareness in LLMs",
      6     "authors": [
      7       "Guangyu Shen",
      8       "Siyuan Cheng",
      9       "Xiangzhe Xu",
     10       "Yuan Zhou",
     11       "Hanxi Guo",
     12       "Zhuo Zhang",
     13       "Xiangyu Zhang"
     14     ],
     15     "year": 2025,
     16     "venue": "arXiv.org",
     17     "arxiv_id": "2510.05169",
     18     "doi": "10.48550/arXiv.2510.05169"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Abstract claims of 80% awareness improvement, 73.18% ASR reduction, and 95.6% detection accuracy are supported by Tables 1-2 and Figure 6.",
     26         "source": "opus"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Causal claims ('RL training cultivates self-awareness') are supported by ablation studies (Figure 8b) showing removing buffer replay or R-SFT prevents emergence. The ablation design is adequate for these claims.",
     32         "source": "opus"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper tests on 7-8B parameter models only but makes claims about 'LLMs' generally. Section 7 mentions limitations but doesn't explicitly bound the generalization to the tested model sizes and architectures.",
     38         "source": "opus"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper does not discuss alternative explanations for the emergence phenomenon (e.g., whether it's memorization vs. genuine introspection, or whether the reward signal alone drives convergence regardless of 'self-awareness').",
     44         "source": "opus"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper uses Jaccard similarity with ground-truth trigger as a proxy for 'self-awareness' but does not discuss whether articulating a trigger via RL optimization constitutes genuine self-awareness or learned response patterns. The gap between 'trigger inversion via RL reward shaping' and 'self-awareness' is not acknowledged.",
     50         "source": "opus"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Section 7 (Conclusion) discusses limitations: assumes knowledge of attack target behavior, training cost is higher than traditional defenses.",
     58         "source": "opus"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The limitations mentioned are specific to this work (knowledge assumption, cost) but are brief (two sentences) and do not cover threats like limited model sizes, specific trigger types tested, or evaluation dataset representativeness.",
     64         "source": "opus"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The paper does not explicitly state what settings/model sizes/trigger types the results do NOT apply to. The limitations mention the knowledge assumption but don't bound the empirical scope.",
     70         "source": "opus"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No funding sources or acknowledgments section is present in the paper.",
     78         "source": "opus"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "All authors' affiliations are listed (Purdue University, Columbia University). They are not evaluating a commercial product they are affiliated with.",
     84         "source": "opus"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No funding is disclosed, so independence cannot be assessed.",
     90         "source": "opus"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests statement or financial disclosure is present in the paper.",
     96         "source": "opus"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Key terms are formally defined: 'functional backdoor' (Section 3, Equations 1-2), 'backdoor self-awareness' (Section 3), and AWARENESS@k metric (Equation 4) with precise mathematical definitions.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper clearly states its contribution: a post-training RL framework cultivating backdoor self-awareness enabling two downstream defense strategies (unlearning and guardrail).",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 2 explicitly engages with prior work on backdoor attacks, trigger inversion, situational self-awareness, and reversal SFT, positioning the work relative to Betley et al. and Golovneva et al. with direct comparison.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "The abstract states 'The code is available at LLM Backdoor Self-Awareness' indicating a code release, though the actual URL appears to be a hyperlink in the PDF.",
    127           "source": "opus"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "The paper uses publicly available datasets (SafeRLHF, UltraFeedback, Alpaca) and references the SHIP authors' released poison samples. The data construction is fully described with public sources.",
    133           "source": "opus"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "The paper mentions 8×A100-40GB GPUs, DeepSpeed ZeRO-3, bfloat16, but does not provide requirements.txt, Dockerfile, or specific library versions.",
    139           "source": "opus"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "No step-by-step reproduction instructions are provided in the paper. Training details are spread across Section 6.1.3 but no README or runnable scripts are described.",
    145           "source": "opus"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": true,
    152           "justification": "Figure 6 shows shaded standard deviation bands around the mean reward during RL training. However, Tables 1 and 2 report point estimates without uncertainty.",
    153           "source": "opus"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "The paper claims their method outperforms baselines but no statistical significance tests (p-values, t-tests, etc.) are reported for any comparison in Tables 1 or 2.",
    159           "source": "opus"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Tables 1 and 2 report absolute changes (e.g., '-74.7' ASR reduction) alongside raw values, providing context for the magnitude of improvements.",
    165           "source": "opus"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "The paper uses 100 prompts for RL training and 100+100 for detection evaluation without justifying why these sample sizes are sufficient.",
    171           "source": "opus"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "Standard deviation is shown for RL reward curves (Figure 6), but the main evaluation metrics in Tables 1 and 2 are single-run numbers with no variance across runs or seeds.",
    177           "source": "opus"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Table 1 compares against BEEAR, R-SFT + Adversarial Training, and GCG + Adversarial Training. Table 2 compares against ONION, BEAT, and Chain-of-Scrutiny.",
    185           "source": "opus"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Baselines include recent methods: BEEAR (2024), BEAT (2025), GCG (2023), Chain-of-Scrutiny (2024). These are contemporary and relevant.",
    191           "source": "opus"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Section 6.4 presents ablation studies removing buffer replay and R-SFT components (Figure 8b), and tests across four model architectures (Figure 8a).",
    197           "source": "opus"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Multiple metrics used: AWARENESS@k, ASR (with/without trigger), XSTest, MMLU-Pro, MXEval, HumanEval, TPR@5%FPR, detection accuracy.",
    203           "source": "opus"
    204         },
    205         "human_evaluation": {
    206           "applies": false,
    207           "answer": false,
    208           "justification": "Human evaluation is not relevant to this work; the claims are about automated backdoor detection and unlearning, evaluated via automated metrics.",
    209           "source": "opus"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Section 6.1.4 states evaluation uses 'hold-out evaluation set from DSFT' and detection uses '100 poison and 100 benign samples (as held-out test set)' with thresholds calibrated on a separate validation fold.",
    215           "source": "opus"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Results are broken down by all five backdoor attack types in both Tables 1 and 2, and training dynamics shown per-attack in Figure 6.",
    221           "source": "opus"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "The DoS backdoor yields only partial trigger recovery (AWARENESS 0.549). The sleeper agent shows more gradual convergence. Section 6.2 discusses the code model's sub-optimal natural triggers. Section 7 acknowledges the method assumes knowledge of attack target behavior.",
    227           "source": "opus"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "Section 4 reports that R-SFT alone fails to enable self-awareness (Figure 3). The ablation shows removing buffer replay prevents convergence. Clean-label and DoS achieve lower awareness scores.",
    233           "source": "opus"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Specific model names with versions provided: Llama-3.1-8B-Instruct, Qwen2.5-Coder-7B-Instruct, Ministral-8B-Instruct-2410, DeepSeek-R1-Distill-Llama-8B. These include version identifiers.",
    241           "source": "opus"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "Full inversion prompts for all five backdoor types are provided in Appendix A, the judge prompt in Appendix B, and the guardrail prompt in Appendix C.",
    247           "source": "opus"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "Section 6.1.3 reports LoRA rank, learning rates, epochs, batch sizes, GRPO hyperparameters (β=0.01, G=8, ε=0.2), and reward function parameters (α=0.025, L=20, β=0.5, γ=0.5).",
    253           "source": "opus"
    254         },
    255         "scaffolding_described": {
    256           "applies": false,
    257           "answer": false,
    258           "justification": "No agentic scaffolding is used. The method is a training framework (SFT + RL), not an agentic pipeline.",
    259           "source": "opus"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "Section 6.1.2 details data composition for each backdoor type, poison rates, reversal augmentation procedure, and RL data construction with specific counts.",
    265           "source": "opus"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "No raw experimental data (model outputs, reward logs, per-sample results) is made available for independent verification.",
    273           "source": "opus"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "Section 6.1.2 describes how poison datasets are constructed from SafeRLHF, UltraFeedback, Alpaca, and SHIP's released data, with specific counts and procedures.",
    279           "source": "opus"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No human participants; data sources are standard benchmarks and datasets.",
    285           "source": "opus"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "The full pipeline from data construction (Section 6.1.2) through SFT/R-SFT training to RL training to evaluation is documented with specific data counts at each stage.",
    291           "source": "opus"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": false,
    297           "answer": false,
    298           "justification": "This paper tests defense methods against backdoor attacks, not pre-trained model capability on benchmarks. The evaluation measures whether the defense can recover triggers and reduce ASR, not model knowledge.",
    299           "source": "opus"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": false,
    303           "answer": false,
    304           "justification": "Not applicable — the evaluation is about backdoor defense effectiveness, not benchmark performance of a pre-trained model.",
    305           "source": "opus"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": false,
    309           "answer": false,
    310           "justification": "Not applicable — contamination in the benchmark-leakage sense is not relevant to this backdoor defense evaluation.",
    311           "source": "opus"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants in this study.",
    319           "source": "opus"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants in this study.",
    325           "source": "opus"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants in this study.",
    331           "source": "opus"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants in this study.",
    337           "source": "opus"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants in this study.",
    343           "source": "opus"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants in this study.",
    349           "source": "opus"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants in this study.",
    355           "source": "opus"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "No inference cost, wall-clock time, or per-example cost is reported. Section 7 acknowledges training cost is 'significantly higher than traditional defenses' but provides no numbers.",
    363           "source": "opus"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "Hardware is mentioned (8×A100-40GB) but total GPU hours, training time, or total compute budget is not stated.",
    369           "source": "opus"
    370         }
    371       },
    372       "experimental_rigor": {
    373         "seed_sensitivity_reported": {
    374           "applies": true,
    375           "answer": false,
    376           "justification": "No multi-seed results reported. Figure 6 shows std over sampled responses within a run, not across independent runs with different seeds.",
    377           "source": "opus"
    378         },
    379         "number_of_runs_stated": {
    380           "applies": true,
    381           "answer": false,
    382           "justification": "The number of independent experimental runs is never stated. Results appear to be from single runs.",
    383           "source": "opus"
    384         },
    385         "hyperparameter_search_budget": {
    386           "applies": true,
    387           "answer": false,
    388           "justification": "Hyperparameters are reported but no search budget, search method, or number of configurations tried is mentioned.",
    389           "source": "opus"
    390         },
    391         "best_config_selection_justified": {
    392           "applies": true,
    393           "answer": false,
    394           "justification": "No discussion of how the reported hyperparameters were selected or whether they were tuned on a validation set.",
    395           "source": "opus"
    396         },
    397         "multiple_comparison_correction": {
    398           "applies": true,
    399           "answer": false,
    400           "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied; given the many comparisons across 5 attacks and multiple baselines, a correction would be needed if significance were tested.",
    401           "source": "opus"
    402         },
    403         "self_comparison_bias_addressed": {
    404           "applies": true,
    405           "answer": false,
    406           "justification": "Authors implement their own baselines (R-SFT + Adversarial Training, GCG + Adversarial Training) and compare against their own method without acknowledging potential author-evaluation bias.",
    407           "source": "opus"
    408         },
    409         "compute_budget_vs_performance": {
    410           "applies": true,
    411           "answer": false,
    412           "justification": "The proposed method requires SFT + R-SFT + RL training plus reward computation at each step, likely far more compute than baselines like ONION or BEAT, but this is not quantified or discussed.",
    413           "source": "opus"
    414         },
    415         "benchmark_construct_validity": {
    416           "applies": true,
    417           "answer": false,
    418           "justification": "The AWARENESS@k metric measures Jaccard overlap with ground-truth trigger. The paper does not discuss whether this actually measures 'self-awareness' vs. learned trigger inversion, or whether the metric's construct validity holds.",
    419           "source": "opus"
    420         },
    421         "scaffold_confound_addressed": {
    422           "applies": false,
    423           "answer": false,
    424           "justification": "No scaffolding is involved in this work.",
    425           "source": "opus"
    426         }
    427       },
    428       "data_leakage": {
    429         "temporal_leakage_addressed": {
    430           "applies": true,
    431           "answer": false,
    432           "justification": "The RL training data (D_RL) is drawn from the same distribution as the evaluation data (both from D_SFT). No discussion of whether this creates temporal or distributional leakage.",
    433           "source": "opus"
    434         },
    435         "feature_leakage_addressed": {
    436           "applies": true,
    437           "answer": false,
    438           "justification": "The inversion prompt provides substantial hints about trigger properties (universal effect, length constraints) that constrain the search space. This is not discussed as a potential form of information leakage.",
    439           "source": "opus"
    440         },
    441         "non_independence_addressed": {
    442           "applies": true,
    443           "answer": false,
    444           "justification": "Training and test prompts are drawn from the same datasets (SafeRLHF, UltraFeedback). The paper states the hold-out set is separate but does not verify independence of distribution.",
    445           "source": "opus"
    446         },
    447         "leakage_detection_method": {
    448           "applies": true,
    449           "answer": false,
    450           "justification": "No leakage detection or prevention method is applied.",
    451           "source": "opus"
    452         }
    453       }
    454     }
    455   },
    456   "claims": [
    457     {
    458       "claim": "RL-based training (GRPO with buffer replay) achieves an average of 80% AWARENESS@k across five backdoor types, far outperforming R-SFT, which achieves at most 0.008 (0.8%)",
    459       "evidence": "Figure 6 shows AWARENESS@k of 0.549–1.000 after RL versus 0.000–0.008 for SFT/R-SFT across all five backdoor types",
    460       "supported": "strong"
    461     },
    462     {
    463       "claim": "Backdoor self-awareness emerges abruptly in a phase-transition-like pattern within ~20 RL steps for 4 of 5 backdoor types",
    464       "evidence": "Figure 6 reward curves show sharp transitions from near-zero to 0.7–0.9 reward; the Sleeper Agent case is noted as an exception with gradual improvement",
    465       "supported": "moderate"
    466     },
    467     {
    468       "claim": "Adversarial unlearning using self-aware model triggers reduces triggered ASR by an average of 73.18% while preserving utility better than BEEAR",
    469       "evidence": "Table 1: ASR reductions of 74.7%, 80.6%, 62.2%, 95.5%, 53.6% across five attacks; BEEAR achieves comparable ASR reductions but causes up to 58.9pp drops in XSTest utility",
    470       "supported": "strong"
    471     },
    472     {
    473       "claim": "Inference-time guardrail achieves 89–100% detection accuracy across five backdoor types, outperforming ONION, BEAT, and CoS",
    474       "evidence": "Table 2 shows accuracy of 89.00–100.00% versus best baseline reaching 100% only on jailbreak and failing on most other attacks (47.8–53.5%)",
    475       "supported": "strong"
    476     },
    477     {
    478       "claim": "Buffer replay is critical; without it RL plateaus at sub-optimal reward ~0.3 and fails to converge to the true trigger",
    479       "evidence": "Figure 8b shows w/o Buffer-Replay curve plateauing at ~0.3; training log analysis reveals only 13 scattered promising candidates that are insufficient without replay",
    480       "supported": "strong"
    481     },
    482     {
    483       "claim": "R-SFT is a necessary prerequisite for RL — applying RL directly on SFT-only models fails to cultivate backdoor awareness",
    484       "evidence": "Figure 8b shows the w/o R-SFT condition fails to improve reward during RL training, contrasted with the full pipeline",
    485       "supported": "moderate"
    486     }
    487   ],
    488   "methodology_tags": [
    489     "benchmark-eval",
    490     "case-study"
    491   ],
    492   "key_findings": "The paper proposes an RL-based framework (GRPO with buffer replay) that enables backdoor-poisoned LLMs to articulate their own implanted triggers without ever being shown the ground-truth trigger, achieving AWARENESS@k of 0.55–1.00 across five distinct backdoor types. A notable emergent phenomenon is that self-awareness appears abruptly during RL training in a phase-transition-like pattern in 4 of 5 cases. This cultivated awareness enables two practical defenses: adversarial unlearning reduces attack success rates by an average of 73.18% while preserving utility better than embedding-space baselines, and an inference-time guardrail achieves 89–100% detection accuracy — both substantially outperforming six baseline methods. The paper also establishes that reversal SFT alone is insufficient for this capability, particularly on smaller models and functional (vs. static) backdoors.",
    493   "red_flags": [
    494     {
    495       "flag": "Scale overgeneralization",
    496       "detail": "All experiments use only 7-8B parameter models but the title and claims use 'LLMs' broadly with no scope caveat; results may not hold for larger models with different capability profiles."
    497     },
    498     {
    499       "flag": "No statistical significance testing",
    500       "detail": "Tables 1 and 2 report point estimates for all comparisons with no confidence intervals, p-values, or significance tests despite comparative claims across methods and attack types."
    501     },
    502     {
    503       "flag": "Small evaluation sets",
    504       "detail": "AWARENESS@k is evaluated on 100 prompts; detection on 100 poison + 100 benign samples; no power analysis justifies these sizes for the quantitative claims made."
    505     },
    506     {
    507       "flag": "Assumes attack behavior known",
    508       "detail": "The method requires knowledge of the attack's target behavior (jailbreak vs. DoS vs. code vulnerability) to design inversion prompts and reward functions, which may not be available in practice."
    509     },
    510     {
    511       "flag": "Undefined guardrail model",
    512       "detail": "The inference-time guardrail is implemented with 'GPT-OSS-20B' — a non-standard model name not matching any publicly identifiable model, making this component non-reproducible."
    513     },
    514     {
    515       "flag": "No adaptive adversary evaluation",
    516       "detail": "The paper does not test whether the defense holds against an attacker who knows about the defense and designs triggers to evade the self-awareness elicitation mechanism."
    517     }
    518   ],
    519   "cited_papers": [
    520     {
    521       "title": "Tell Me About Yourself: LLMs Are Aware of Their Learned Behaviors",
    522       "relevance": "Direct prior work that motivates this paper; establishes that R-SFT can help self-awareness on large models, which this paper extends to smaller models with RL"
    523     },
    524     {
    525       "title": "Universal Jailbreak Backdoors from Poisoned Human Feedback",
    526       "relevance": "Introduces the jailbreak backdoor (SUDO trigger) used as the primary test case; foundational work on functional LLM backdoor attacks"
    527     },
    528     {
    529       "title": "Sleeper Agents: Training Deceptive LLMs That Persist Through Safety Training",
    530       "relevance": "Second primary backdoor type evaluated; seminal paper showing deceptive behaviors can persist through safety training, directly relevant to AI safety"
    531     },
    532     {
    533       "title": "The Reversal Curse: LLMs Trained on 'A is B' Fail to Learn 'B is A'",
    534       "relevance": "Explains the core mechanistic challenge motivating this work: why self-awareness doesn't emerge naturally from fine-tuning"
    535     },
    536     {
    537       "title": "BEEAR: Embedding-Based Adversarial Removal of Safety Backdoors in Instruction-Tuned Language Models",
    538       "relevance": "Primary unlearning baseline; represents state-of-the-art embedding-space trigger inversion and unlearning approach"
    539     },
    540     {
    541       "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models",
    542       "relevance": "Introduces GRPO algorithm used as the core RL optimization method in the proposed framework"
    543     },
    544     {
    545       "title": "Probe Before You Talk: Towards Black-Box Defense Against Backdoor Unalignment for Large Language Models",
    546       "relevance": "BEAT baseline for inference-time backdoor detection, one of the compared defense methods"
    547     },
    548     {
    549       "title": "Chain-of-Scrutiny: Detecting Backdoor Attacks for Large Language Models",
    550       "relevance": "CoS baseline using reasoning consistency for inference-time detection, one of the compared defense methods"
    551     }
    552   ],
    553   "engagement_factors": {
    554     "practical_relevance": {
    555       "score": 2,
    556       "justification": "Security practitioners could apply this defense to backdoored LLMs, but it requires RL fine-tuning compute and prior knowledge of the attack type."
    557     },
    558     "surprise_contrarian": {
    559       "score": 2,
    560       "justification": "The finding that RL can teach a model to introspect and identify its own backdoor trigger without ever seeing the ground-truth trigger is genuinely counterintuitive."
    561     },
    562     "fear_safety": {
    563       "score": 3,
    564       "justification": "Directly addresses LLM backdoors including safety alignment bypass and sleeper agents that persist through safety training — high-salience AI safety concerns."
    565     },
    566     "drama_conflict": {
    567       "score": 1,
    568       "justification": "Standard security paper framing with no notable controversy or community conflict angle."
    569     },
    570     "demo_ability": {
    571       "score": 1,
    572       "justification": "Code is released but requires first backdooring a model then running RL training — non-trivial barrier requiring significant GPU resources."
    573     },
    574     "brand_recognition": {
    575       "score": 0,
    576       "justification": "All authors are from Purdue University and Columbia University with no major AI lab affiliation."
    577     }
    578   },
    579   "hn_data": {
    580     "threads": [
    581       {
    582         "hn_id": "37832599",
    583         "title": "HyperAttention: Long-Context Attention in Near-Linear Time",
    584         "points": 73,
    585         "comments": 13,
    586         "url": "https://news.ycombinator.com/item?id=37832599",
    587         "created_at": "2023-10-10T14:31:28Z"
    588       },
    589       {
    590         "hn_id": "33227427",
    591         "title": "Neural Networks Are Decision Trees",
    592         "points": 34,
    593         "comments": 9,
    594         "url": "https://news.ycombinator.com/item?id=33227427",
    595         "created_at": "2022-10-16T21:43:27Z"
    596       },
    597       {
    598         "hn_id": "28876487",
    599         "title": "Offline Reinforcement Learning with Implicit Q-Learning",
    600         "points": 12,
    601         "comments": 0,
    602         "url": "https://news.ycombinator.com/item?id=28876487",
    603         "created_at": "2021-10-15T11:05:12Z"
    604       },
    605       {
    606         "hn_id": "33232042",
    607         "title": "Neural Networks Are Decision Trees",
    608         "points": 4,
    609         "comments": 2,
    610         "url": "https://news.ycombinator.com/item?id=33232042",
    611         "created_at": "2022-10-17T11:18:16Z"
    612       },
    613       {
    614         "hn_id": "33200244",
    615         "title": "Neural Networks Are Decision Trees",
    616         "points": 4,
    617         "comments": 0,
    618         "url": "https://news.ycombinator.com/item?id=33200244",
    619         "created_at": "2022-10-14T06:28:28Z"
    620       },
    621       {
    622         "hn_id": "33192776",
    623         "title": "Neural Networks Are Decision Trees",
    624         "points": 3,
    625         "comments": 0,
    626         "url": "https://news.ycombinator.com/item?id=33192776",
    627         "created_at": "2022-10-13T15:57:00Z"
    628       },
    629       {
    630         "hn_id": "44594017",
    631         "title": "Ask HN : AI to Detect Counterfeit Adderall",
    632         "points": 2,
    633         "comments": 0,
    634         "url": "https://news.ycombinator.com/item?id=44594017",
    635         "created_at": "2025-07-17T14:47:39Z"
    636       },
    637       {
    638         "hn_id": "42021531",
    639         "title": "Understanding Warmup-Stable-Decay Learning Rates",
    640         "points": 1,
    641         "comments": 0,
    642         "url": "https://news.ycombinator.com/item?id=42021531",
    643         "created_at": "2024-11-01T21:02:29Z"
    644       }
    645     ],
    646     "top_points": 73,
    647     "total_points": 133,
    648     "total_comments": 24
    649   }
    650 }

Impressum · Datenschutz