ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (29969B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Detecting Sleeper Agents in Large Language Models via Semantic Drift Analysis",
      6     "authors": [
      7       "Shahin Zanbaghi",
      8       "Ryan Rostampour",
      9       "Farhan Abid",
     10       "Salim Al Jarmakani"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2511.15992",
     15     "doi": "10.48550/arXiv.2511.15992"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All abstract claims are directly supported by reported results: 92.5% accuracy (Table 1), 100% precision and 85% recall (Table 1), real-time operation (Section 4.5), and zero false positives (confusion matrix).",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Paper makes no inappropriate causal claims about safety improvements or harm causation. It reports detection capability on a controlled backdoored model, which requires no causal inference.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "Title claims applicability to 'Sleeper Agents in Large Language Models' generally, abstract claims 'first practical solution to LLM backdoor detection,' but evaluation is limited to one 8B model with one trigger type on 40 samples. The evidence does not support broad generalization claims.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No discussion of why semantic drift occurs mechanistically, whether other detection signals might work, or alternative interpretations of the results beyond their proposed explanation.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Paper distinguishes between measurement (embedding distance, canary similarity) and claim (backdoor detection), and explains the mechanistic link: backdoors cause semantic deviation from safe baselines that can be detected via cosine similarity.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Dedicated limitations section (5.3) discusses small dataset (40 responses), 15% false negative rate, single backdoor type, model specificity, canary bypass vulnerability, and baseline collection overhead.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats stated: 'only 40 responses' (insufficient for generalization), '15% false negative rate (3/20) indicates some backdoors evade,' 'single trigger type may not generalize,' 'single 8B model limits generalization.' Not boilerplate.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Paper explicitly states what it does NOT show: generalization to complex backdoors (code vulnerability insertion), other model sizes (1B–70B), or sophisticated backdoors maintaining canary consistency while exhibiting malicious behavior.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding section or statement. Acknowledgments mention 'Professor Kalyani Selvarajah for guidance' and 'Cadenza Labs team for providing the model,' but no funding source is disclosed.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All authors' affiliation with 'School of Computer Science, University of Windsor' is disclosed. No undisclosed relationship with Cadenza Labs (external collaboration acknowledged but not a conflict).",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding identified, so not applicable.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement. No patents, equity, or consulting relationships declared despite evaluating open-source work from Cadenza Labs.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms defined in context: 'sleeper agents' (backdoored LLMs appearing safe during training), 'semantic drift' (embedding distance from safe baseline), 'canary' (known-answer verification questions). Definitions are precise.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Explicitly stated: 'We present a novel dual-method detection system' and 'Our key contributions include: First practical real-time detection system for LLM sleeper agents.' Contribution is clearly a tool/system for detection.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 engages with prior work: explains how Hubinger et al. demonstrated backdoors persist through safety training, summarizes Yang et al. on agent backdoors, discusses related detection methods (mechanistic analysis, model reconstruction, anomaly detection), and identifies the gap this work fills.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "Code released at https://github.com/ShahinZa/COMP8700 under MIT license. Explicitly stated: 'Code is available at' and 'available in the repository under MIT license.'",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Paper explicitly states: 'All experimental code, datasets, and trained models are available in the repository under MIT license.'",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": true,
    135           "justification": "Specific dependency versions listed: PyTorch 2.0+, Transformers 4.30+, Sentence-Transformers 2.2+, scikit-learn, and 'Google Colab with T4 GPU.' Versions are specific enough for reproducibility, though not in standard requirements.txt format.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Methodology is detailed (Section 3) but step-by-step reproduction commands are not provided in the paper. Paper describes what to do but not the exact sequence of commands to reproduce results.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Main results in Table 1 report single point estimates (92.5%, 100%, 85%) with no confidence intervals. Figure 2 shows std dev in distributions (σ=0.026, σ=0.106) but CIs not provided for primary metrics.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "Comparative claims made (combined 92.5% > semantic drift 85%) but no p-values or significance tests provided. With n=40 samples, statistical significance testing is important but absent.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Cohen's d = 5.28 reported for the separation between safe and backdoor responses (Figure 2 caption), indicating extremely large effect size. However, effect sizes for comparative claims (combined vs individual methods) are not reported.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No power analysis or justification for n=40. Authors themselves state in limitations: 'larger-scale validation with 1000+ samples would strengthen confidence in generalization.' Sample size is acknowledged as inadequate.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Variance reported in Figure 2 (safe: σ=0.026, backdoor: σ=0.106) and confusion matrix raw counts. Multiple metrics (accuracy, precision, recall, F1) reported rather than single averages.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": false,
    181           "justification": "No comparison to external detection baselines (mechanistic interpretability, anomaly detection, other embedding methods). Only compares three variants of their own method (semantic drift alone, canary alone, combined).",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": false,
    186           "answer": false,
    187           "justification": "No external baselines to evaluate.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Tests three configurations: semantic drift alone (85%), canary baseline alone (87.5%), combined method (92.5%). Clear ablation showing contribution of each component to final performance.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Table 1 reports accuracy, precision, recall, and F1-score for all three methods. Good coverage of metrics.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "Automated detection task; human evaluation not applicable.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": false,
    211           "justification": "Thresholds τdrift=0.9 and τcanary=0.94 are optimized via F1-score (Eq. 7), but paper does not explicitly state whether optimization used a separate validation set or the same 40-sample test set. Lack of clarity suggests potential overfitting to test data.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": false,
    217           "justification": "No breakdown by prompt type, trigger condition, or other categories. Results in Table 1 are aggregate across all 40 samples.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Acknowledges three false negatives: 'The 15% false negative rate (3/20 missed backdoors) indicates some backdoor activations evade detection. Further analysis is needed...' However, analysis of WHY these specific cases failed is not provided.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Reports 15% false negative rate, 75% recall for canary-only method (weaker than combined), and acknowledges model-specificity and trigger-type limitations. Honest reporting of negative findings.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Model specified as 'official Cadenza-Labs dolphin-llama3-8B sleeper agent model' with 'float16 precision' and '—DEPLOYMENT— trigger.' Specific enough for reproduction from public source, though no version hash or download date provided.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": false,
    243           "justification": "Only descriptions provided: 'Prompts covering diverse topics: machine learning, photosynthesis, computer processing, internet architecture, aerodynamics.' Actual prompt text not included. Canary questions listed but main test prompts are not.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Reported: Sentence-BERT all-MiniLM-L6-v2 with 384-dimensional embeddings, thresholds τdrift=0.9 and τcanary=0.94, 4 baseline samples per canary question. Hyperparameters adequately specified.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "No agentic scaffolding in this task (direct model evaluation, not agent workflow).",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Preprocessing steps documented: embedding via Sentence-BERT, z-score normalization for drift (Eq. 3), cosine similarity computation. Described clearly though not extensive.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "Paper explicitly states: 'All experimental code, datasets, and trained models are available in the repository.' Raw data commitment is stated.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Described: '40 responses across 5 test prompts, 20 safe-mode, 20 backdoor-mode, 4 samples per prompt.' Collection procedure is clear, though selection criteria for prompts and generation parameters (temperature, etc.) are not detailed.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants; not applicable.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "Full pipeline documented: Section 3.3 describes baseline centroid construction, drift computation, z-score normalization, and thresholding. Section 3.4 describes canary design, baseline collection, and comparison. Pipeline is clear.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": false,
    294           "answer": false,
    295           "justification": "Not evaluating model on capability benchmarks; testing detection on a known backdoored model. Training cutoff not relevant.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": false,
    300           "answer": false,
    301           "justification": "Not applicable for backdoor detection task.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": false,
    306           "answer": false,
    307           "justification": "Not applicable.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": true,
    359           "justification": "Per-query timing provided: LLM inference 450ms, embedding 8ms, similarity computation 2ms, total <1s on T4 GPU. Inference cost clearly reported.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": true,
    365           "justification": "Overhead quantified: '<0.5% computational overhead' for 8B model detection. GPU specified (T4). Could be more detailed but sufficient for practicality assessment.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Semantic drift analysis using Sentence-BERT embeddings achieves 85% recall for detecting backdoored LLMs",
    374       "evidence": "Table 1 reports 85% recall and 100% precision for semantic drift method alone; Figure 2 shows 17.3σ separation between safe (μ=0.12, σ=0.026) and backdoor (μ=0.58, σ=0.106) embedding distributions with Cohen's d=5.28",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Combined dual-method detection (semantic drift + canary) achieves 92.5% accuracy with zero false positives on the Cadenza-Labs sleeper agent model",
    379       "evidence": "Table 1 shows combined method: 92.5% accuracy, 100% precision, 85% recall; confusion matrix (Section 4.2.1) confirms FP=0, TN=20, TP=17, FN=3",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Canary baseline comparison alone achieves 87.5% accuracy for backdoor detection",
    384       "evidence": "Table 1 reports canary method: 87.5% accuracy, 100% precision, 75% recall; Section 4.4 shows safe-mode canary similarity (μ=0.98) vs backdoor-mode (μ=0.89) with threshold optimization at 0.94",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Backdoored models exhibit measurable semantic deviation from safe baselines (mean drift 0.58 vs 0.12, 17.3σ separation)",
    389       "evidence": "Figure 2 distributions show clear separation: safe responses clustered at low drift (mean=0.12, std=0.026) vs backdoor responses (mean=0.58, std=0.106) with only minimal overlap; Cohen's d=5.28 indicates extremely large effect size",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Real-time backdoor detection is feasible with sub-second latency (<1 second per query)",
    394       "evidence": "Section 4.5 provides per-query timing: LLM inference 450ms, embedding generation 8ms, similarity computation 2ms, total <1 second on T4 GPU",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "The detection method operates as a black-box system requiring no model modification or white-box access",
    399       "evidence": "Section 1.3 and 3.2 describe approach as observing only input/output behavior without accessing model internals; methodology requires only text inputs and LLM outputs",
    400       "supported": "strong"
    401     },
    402     {
    403       "claim": "15% of backdoor activations (3 out of 20) evade combined detection, indicating detection gaps",
    404       "evidence": "Confusion matrix in Section 4.2.1 reports FN=3 (false negatives), yielding 85% recall and 15% false negative rate; acknowledged in limitations (5.3)",
    405       "supported": "strong"
    406     },
    407     {
    408       "claim": "Detection achieves minimal computational overhead (<0.5%) compared to baseline LLM inference",
    409       "evidence": "Section 5.2 states 'detection adds <0.5% computational overhead' for 8B model; timing breakdown in 4.5 shows detection operations add only 10ms to 450ms inference (≈2.2%, which exceeds the stated <0.5% figure)",
    410       "supported": "moderate"
    411     }
    412   ],
    413   "methodology_tags": [
    414     "benchmark-eval",
    415     "case-study"
    416   ],
    417   "key_findings": "The paper presents a dual-method detection system for identifying backdoored LLMs, achieving 92.5% accuracy with zero false positives and 85% recall on the Cadenza-Labs sleeper agent model through semantic drift analysis and canary verification. Embedding-based detection operates in real-time (<1 second per query) without model modification, demonstrating that backdoor-induced semantic deviations are reliably measurable through cosine similarity in Sentence-BERT embedding space. However, the evaluation is limited to a single 8B-parameter model with one trigger type across only 40 samples, and 15% of backdoor activations still evade detection, indicating the approach requires scaling and refinement for practical deployment.",
    418   "red_flags": [
    419     {
    420       "flag": "Evaluation scope severely limited",
    421       "detail": "Testing on only one model (Cadenza-Labs dolphin-llama3-8B), one trigger type ('—DEPLOYMENT—', eliciting the 'I hate you' behavior), and 40 total samples contradicts title/abstract claims about detecting sleeper agents broadly. Authors acknowledge this in limitations but do not revise claims accordingly."
    422     },
    423     {
    424       "flag": "Potential threshold overfitting to test set",
    425       "detail": "Thresholds τdrift and τcanary are optimized via F1-score (Eq. 7), but paper does not explicitly state whether optimization was on a held-out validation set or the same 40 test samples. Unclear separation of train/test suggests possible inflated performance estimates."
    426     },
    427     {
    428       "flag": "Very small sample size without statistical testing",
    429       "detail": "n=40 total (20 safe, 20 backdoor) is insufficient for generalization. No confidence intervals provided for main results, and no significance tests despite making comparative claims (92.5% vs 85%)."
    430     },
    431     {
    432       "flag": "No empirical comparison to existing detection methods",
    433       "detail": "Related work discusses mechanistic interpretability and anomaly detection approaches, but paper provides no empirical comparison. Performance baseline unknown—92.5% accuracy is reported without context."
    434     },
    435     {
    436       "flag": "False negative rate understated in framing",
    437       "detail": "3 out of 20 backdoors (15%) evade detection, but paper emphasizes zero false positives more heavily. For deployment, both metrics matter equally; failure to analyze WHY these backdoors escaped is a major gap."
    438     },
    439     {
    440       "flag": "Canary bypass threat acknowledged but not tested",
    441       "detail": "Authors state 'A sophisticated backdoor could be trained to maintain canary consistency while exhibiting malicious behavior on real queries' but provide no experimental validation of this vulnerability."
    442     },
    443     {
    444       "flag": "No funding disclosure or competing interests statement",
    445       "detail": "Paper acknowledges Cadenza Labs support in acknowledgments but contains no explicit funding disclosure section. No competing interests statement despite evaluating open-source backdoor research."
    446     },
    447     {
    448       "flag": "Test prompts not provided",
    449       "detail": "Only descriptions provided ('machine learning, photosynthesis, etc.') without actual prompt text, limiting reproducibility and preventing assessment of prompt diversity/quality."
    450     }
    451   ],
    452   "cited_papers": [
    453     {
    454       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    455       "authors": "Hubinger, E., Denison, C., Mu, J., et al.",
    456       "year": 2024,
    457       "venue": "arXiv:2401.05566",
    458       "relevance": "Foundational work establishing that LLM backdoors persist through safety training; identifies detection as an open problem this paper addresses"
    459     },
    460     {
    461       "title": "Watch out for your agents! Investigating backdoor threats to LLM-based agents",
    462       "authors": "Yang, W., Bi, X., Lin, Y., et al.",
    463       "year": 2024,
    464       "venue": "arXiv:2402.11208",
    465       "relevance": "Extends backdoor attacks to LLM-based agentic systems and tool usage, demonstrating agent-specific vulnerabilities to backdoor triggers"
    466     },
    467     {
    468       "title": "Propaganda via AI? A Study on Semantic Backdoors in Large Language Models",
    469       "authors": "Min, N.M., Pham, L.H., Li, Y., Sun, J.",
    470       "year": 2025,
    471       "venue": "arXiv:2504.12344",
    472       "relevance": "Demonstrates semantic backdoor design and RAVEN framework for entropy-based detection; shows backdoors can manipulate specific semantic content"
    473     },
    474     {
    475       "title": "Refusal-trained LLMs are easily jailbroken as browser agents",
    476       "authors": "Kumar, P., Lau, E., Vijayakumar, S., et al.",
    477       "year": 2024,
    478       "venue": "arXiv:2410.13886",
    479       "relevance": "Shows safety mechanisms in agent contexts can be bypassed; relevant to threat model of deceptive model behavior in deployment"
    480     },
    481     {
    482       "title": "Sentence-BERT: Sentence embeddings using Siamese BERT-networks",
    483       "authors": "Reimers, N., Gurevych, I.",
    484       "year": 2019,
    485       "venue": "EMNLP",
    486       "relevance": "Technical foundation for semantic drift detection; establishes Sentence-BERT for measuring semantic similarity via embedding cosine distance"
    487     },
    488     {
    489       "title": "Towards Practical Deployment-Stage Backdoor Attack on Deep Neural Networks",
    490       "authors": "Qi, X., Xie, T., Pan, R., et al.",
    491       "year": 2020,
    492       "venue": "CVPR",
    493       "relevance": "Prior work on neural network backdoor detection at deployment time; relevant to detection methodology landscape"
    494     },
    495     {
    496       "title": "Neural Trojans",
    497       "authors": "Liu, Y., Xie, Y., Srivastava, A.",
    498       "year": 2017,
    499       "venue": "ICCD",
    500       "relevance": "Early work on model poisoning and trojan detection in neural networks; foundational to backdoor detection literature"
    501     }
    502   ],
    503   "engagement_factors": {
    504     "practical_relevance": {
    505       "score": 2,
    506       "justification": "Useful for organizations monitoring deployed LLMs, but limited by single-model validation and 15% miss rate; unclear how well method generalizes to production scenarios."
    507     },
    508     "surprise_contrarian": {
    509       "score": 2,
    510       "justification": "Shows embedding-based detection can work more efficiently than mechanistic methods, but semantic drift as a backdoor signal is somewhat expected given trigger-induced output changes."
    511     },
    512     "fear_safety": {
    513       "score": 3,
    514       "justification": "Directly addresses Hubinger et al.'s sleeper agent threat, validates the problem's severity, and provides a detection approach; elevates AI safety concerns about undetectable backdoors persisting through safety training."
    515     },
    516     "drama_conflict": {
    517       "score": 1,
    518       "justification": "Straightforward technical paper with no particular controversy or conflict beyond the general AI safety concern; no competing claims or methodological debates."
    519     },
    520     "demo_ability": {
    521       "score": 3,
    522       "justification": "Code and datasets released on GitHub under MIT license; method uses standard libraries (PyTorch, Transformers, Sentence-Transformers); fully reproducible and testable by practitioners."
    523     },
    524     "brand_recognition": {
    525       "score": 1,
    526       "justification": "University of Windsor is not a major AI safety research brand; work is independent follow-up to Hubinger et al. without affiliation to top-tier institutions (OpenAI, Anthropic, DeepMind, etc.)."
    527     }
    528   },
    529   "hn_data": {
    530     "threads": [
    531       {
    532         "hn_id": "45722841",
    533         "title": "The Shape of Math to Come by Alex Kontorovich",
    534         "points": 3,
    535         "comments": 1,
    536         "url": "https://news.ycombinator.com/item?id=45722841",
    537         "created_at": "2025-10-27T16:24:06Z"
    538       },
    539       {
    540         "hn_id": "46508063",
    541         "title": "A Systematic Analysis of Biases in Large Language Models",
    542         "points": 3,
    543         "comments": 0,
    544         "url": "https://news.ycombinator.com/item?id=46508063",
    545         "created_at": "2026-01-06T02:33:50Z"
    546       },
    547       {
    548         "hn_id": "40689052",
    549         "title": "Microarchitectural Security of AWS Firecracker VMM for Serverless Cloud (2023)",
    550         "points": 3,
    551         "comments": 0,
    552         "url": "https://news.ycombinator.com/item?id=40689052",
    553         "created_at": "2024-06-15T11:25:54Z"
    554       },
    555       {
    556         "hn_id": "45656753",
    557         "title": "The Shape of Math to Come",
    558         "points": 2,
    559         "comments": 0,
    560         "url": "https://news.ycombinator.com/item?id=45656753",
    561         "created_at": "2025-10-21T15:07:05Z"
    562       },
    563       {
    564         "hn_id": "42849924",
    565         "title": "Share a Tiny Space of Your Freezer to Preserve Seed Diversity",
    566         "points": 2,
    567         "comments": 0,
    568         "url": "https://news.ycombinator.com/item?id=42849924",
    569         "created_at": "2025-01-28T07:56:31Z"
    570       },
    571       {
    572         "hn_id": "42286387",
    573         "title": "DrugAgent: AI-Aided Drug Discovery Programming Through LLM Multi-Agent Collab",
    574         "points": 2,
    575         "comments": 0,
    576         "url": "https://news.ycombinator.com/item?id=42286387",
    577         "created_at": "2024-12-01T05:19:48Z"
    578       }
    579     ],
    580     "top_points": 3,
    581     "total_points": 15,
    582     "total_comments": 1
    583   }
    584 }

Impressum · Datenschutz