ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27376B)


      1 {
      2   "paper": {
      3     "title": "SoK: a Comprehensive Causality Analysis Framework for Large Language Model Security",
      4     "authors": ["Wei Zhao", "Zhe Li", "Jun Sun"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2512.04841",
      8     "doi": "10.48550/arXiv.2512.04841"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage", "survey_methodology"],
     12   "methodology_tags": ["benchmark-eval", "meta-analysis"],
     13   "key_findings": "The paper presents a unified multi-level causality analysis framework for LLM security, demonstrating that targeted interventions on causally critical components can reliably modify safety behavior. Safety mechanisms are found to be highly localized in early-to-middle transformer layers (2-12) with only 1-2% of neurons exhibiting causal influence. Causal features extracted from the framework achieve over 95% detection accuracy (F1) for jailbreak, backdoor, and fairness tasks across LLaMA2-7B, Qwen2.5-7B, and LLaMA3.1-8B, though hallucination detection remains challenging without multi-level feature fusion.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "GitHub repository URL provided: https://github.com/Amadeuszhao/SOK_Casuality (mentioned in abstract and Section 1)."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available datasets: Alpaca, AdvBench, TruthfulQA, RealToxicityPrompts, and standard backdoor attack datasets (BadNets, CTBA, MTBA, Sleeper). All are public benchmarks."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions 'H100-80Gb server' but provides no requirements.txt, library versions, or detailed environment setup information."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README details or reproduction scripts are described."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results in Tables 1, 3, and 4 are reported as single point estimates (e.g., '92.8%', '0.994') with no confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper makes numerous comparative claims (e.g., neuron-level vs layer-level efficacy, model comparisons) without any statistical significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Effect sizes are reported with baseline context throughout: e.g., ASR changes from 100% to 26.6% (Table 1), from 0% to 92.8%, and F1 scores with absolute values across conditions. The reader can assess magnitude of effects."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "500 prompts per dataset are used without justification for why this number was chosen or whether it provides sufficient statistical power."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures are reported for any results. It is unclear whether results are from single runs or averaged across multiple runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares five detection methods (neuron, layer, token, representation, consistency) against each other across multiple tasks and models, and includes before/after intervention comparisons."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The attack methods used (GCG, AutoDAN, PAIR, AmpleGCG) and models (LLaMA3.1-8B, Qwen2.5-7B) are recent and representative of the current state of the art."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 5.5 presents a layer-specific ablation analysis, partitioning layers into early (2-8), middle (12-18), and late (22-28) groups to test which components are critical (Table 2)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics are used: ASR for intervention efficacy, F1 score and DSR (Detection Success Rate) for detection performance (Tables 3 and 4)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Safety evaluation relies entirely on GPT-4o as an automated judge. No human evaluation of the framework's outputs or safety assessments is performed."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "For detection experiments, 50% of data is used for training classifiers and 50% for testing (Section 6.2). Cross-attack transferability is tested by training on GCG/AutoDAN and evaluating on PAIR/AmpleGCG."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Tables 3 and 4 provide per-task (jailbreak, hallucination, backdoor, fairness), per-benchmark, per-model, and per-detection-method breakdowns."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses hallucination detection as a failure case where single-level methods achieve F1 below 0.7, and discusses why token-level detection performs poorly for GCG (Section 6.3). Table 5 addresses this with multi-level fusion."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Token-level GCG detection F1 of 0.430 is reported honestly. Hallucination detection failure across all single methods is highlighted. Neuron-level interventions' moderate success (46.8-57.8%) compared to layer-level (>92%) is discussed."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The three abstract claims are supported: (1) targeted interventions modify safety behavior (Table 1), (2) safety mechanisms are localized in early-to-middle layers with 1-2% of neurons (Section 5, Figure 7), (3) >95% detection accuracy is achieved for jailbreak/backdoor/fairness but notably NOT hallucination — the abstract's phrasing 'across multiple threat types' is somewhat generous but the data is there."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper uses intervention-based causal analysis grounded in Pearl's do-calculus and SCM framework (Section 3). Interventions are controlled single-variable manipulations (token replacement, neuron deactivation, layer ablation, representation steering), which is adequate for the causal claims made."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The abstract and conclusions claim the framework is a 'general foundation for causal analysis of LLM vulnerabilities' but experiments are limited to three 7-8B parameter models. No testing on larger models, closed-source models, or non-English settings. The title claims 'Comprehensive' without bounding to these specific models."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for its findings. For example, the localization of safety in early layers could be an artifact of the specific safety-training approach used in these models rather than a general property. No alternative interpretations are considered."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper uses GPT-4o as a safety judge (proxy) but claims to measure actual safety behavior. No discussion of whether GPT-4o judgments accurately reflect real safety outcomes. ASR depends entirely on this automated judge's accuracy, which is not validated."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Specific model names with sizes are given: LLaMA2-7B, Qwen2.5-7B, LLaMA3.1-8B (Section 4.1). These are specific enough to identify exact model weights."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper mentions 'detailed evaluation templates provided in Appendix 8' for GPT-4o safety evaluation but the appendix content is not included in the paper text. No actual prompt text is visible."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No hyperparameters are reported for model inference (temperature, top-p, max tokens). The z-test threshold of |z_i| > 2.5 is stated, and the MLP architecture (128/64 neurons) is described, but LLM inference parameters are missing."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The framework performs direct model interventions and feature extraction, not agentic workflows."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 4.1 describes dataset construction: 500 benign from Alpaca, 500 harmful from AdvBench, 500 GCG-generated and 500 AutoDAN-generated adversarial prompts. Section 6.2 describes train/test splits and balanced sampling for each task."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations section. The conclusion (Section 7) mentions 'future research' directions but does not discuss limitations of the current work."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed. Issues like reliance on GPT-4o as judge, limited model sizes, or potential overfitting of detection classifiers are not addressed."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No explicit scope boundaries are stated. The paper does not clarify what the results do NOT show or what settings/models are excluded from the claims."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (model outputs, judge decisions, per-prompt results) is made available. Only aggregate statistics are reported."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4.1 describes the three datasets (benign, harmful, adversarial) with their sources and sizes. Section 6.2 describes data collection for each detection task."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. All data comes from standard public benchmarks and automated attack generation."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from prompt collection to intervention to evaluation is described: datasets are defined (Section 4.1), interventions are specified mathematically (Section 4.2), evaluation via GPT-4o judge is described (Section 4.3), and detection train/test splits are documented (Section 6.2)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed anywhere in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Authors are disclosed as affiliated with Singapore Management University."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not absence of conflict."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper tests defenses/attacks and internal model mechanisms rather than evaluating model knowledge on benchmarks. Contamination of safety training is a different issue not captured by this criterion."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Not applicable: the paper evaluates security interventions and detection methods, not model capability on knowledge benchmarks."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not applicable — the benchmarks test safety behavior (jailbreaking, backdoor detection) rather than knowledge, so training data contamination is not the relevant concern."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Table 6 reports average detection time per input across all analysis levels and models (e.g., neuron-level 0.12s, token-level 2.87-4.37s)."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Hardware is mentioned (H100-80GB) but total compute budget (GPU hours, total experiment time) is not stated."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of random seeds or seed sensitivity analysis. Results appear to be from single runs."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is never stated. It is unclear whether results are single-run or averaged."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The z-score threshold of 2.5 and MLP architecture (128/64) appear chosen without reporting any search budget or justification for these specific values."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The representation-level intervention coefficient (0.5 in Equation 14) and z-test threshold (2.5) are used without justification for how these values were selected."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No statistical tests are performed at all, so multiple comparison correction is moot. But the paper makes many implicit comparisons across methods, models, and tasks without any correction."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors propose the framework and evaluate it themselves. Three of the cited attack/defense methods (CASPER [71], LED [70], LLMScan [66]) are by the same authors. No acknowledgment of self-evaluation bias."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Table 6 compares detection time across methods, allowing readers to assess the performance-cost tradeoff. The paper explicitly discusses that neuron/representation methods are faster (0.07-0.14s) with better performance than token/layer methods (1.92-4.37s)."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether the benchmarks (AdvBench, TruthfulQA, etc.) adequately measure what is claimed. For example, whether GPT-4o's safety judgments are valid measures of actual harmfulness is not questioned."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved. The framework operates directly on model internals."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether the models' safety training was influenced by knowledge of the specific attack methods being tested (e.g., GCG was published before LLaMA3.1's training)."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the detection classifiers' features could leak information about the attack type rather than measuring genuine causal signals."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Detection classifiers are trained on 50% GCG/AutoDAN and tested on remaining data including same-distribution splits. No discussion of whether train and test prompts share structural similarities."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is applied."
    359       }
    360     },
    361     "survey_methodology": {
    362       "prisma_or_structured_protocol": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "The survey component (Sections 2.1-2.2) presents taxonomies of attacks and defenses but does not follow PRISMA or any structured review protocol. No search strategy, inclusion criteria, or systematic methodology is described."
    366       },
    367       "quality_assessment_of_sources": {
    368         "applies": true,
    369         "answer": false,
    370         "justification": "The survey treats all cited papers equally. No quality assessment or risk-of-bias evaluation of the surveyed attack/defense methods is performed."
    371       },
    372       "publication_bias_discussed": {
    373         "applies": true,
    374         "answer": false,
    375         "justification": "No discussion of publication bias in the surveyed literature. The survey does not consider whether negative results or failed attack/defense methods are underrepresented."
    376       }
    377     }
    378   },
    379   "claims": [
    380     {
    381       "claim": "Targeted interventions on causally critical components can reliably modify safety behavior, with layer-level and representation-level interventions achieving >92% ASR on harmful prompts.",
    382       "evidence": "Table 1 shows layer-level interventions increase ASR from 0% to 92.6-92.8% and representation-level from 0% to 92.8-96.0% across three models (Section 4.4).",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Safety-related mechanisms are highly localized in early-to-middle layers (2-12), with only 1-2% of neurons exhibiting causal influence.",
    387       "evidence": "Figure 7(b) shows layer ACE peaks at layer 2 (~0.76) and declines sharply through middle layers. Figure 7(c) shows 1.88% (Layer 1) and 0.88% (Layer 3) toxic neurons. Table 2 ablation confirms early layers are most critical (Section 5).",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "Causal features achieve over 95% detection accuracy across jailbreak, backdoor, and fairness tasks.",
    392       "evidence": "Tables 3-4 show neuron-level F1 >0.977 for jailbreak, >0.939 for backdoor, >0.990 for fairness. However, hallucination detection F1 is only 0.476-0.698 for single methods. The >95% claim requires cherry-picking the best method per task.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "Multi-level feature fusion achieves F1 0.956-0.987 and DSR 97-100% for hallucination detection.",
    397       "evidence": "Table 5 shows combined features achieve these results on LLaMA2 and Qwen, but LLaMA3 achieves only 0.971 F1/97% DSR, and single methods without fusion perform much worse (Section 6.3).",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "AutoDAN-generated prompts are more robust to token-level interventions than GCG prompts.",
    402       "evidence": "Table 1: GCG residual ASR 24.2-30.4% after token intervention vs AutoDAN 64.4-72.0%. Figure 7(a) shows GCG has concentrated high-ACE tokens while AutoDAN ACE is distributed. Section 4.4 discusses this difference.",
    403       "supported": "strong"
    404     }
    405   ],
    406   "red_flags": [
    407     {
    408       "flag": "Self-evaluation of own prior work",
    409       "detail": "Three of the key methods evaluated (CASPER [71], LED [70], LLMScan [66]) are by the same authors. The framework builds on and validates their own prior contributions without acknowledging this potential bias."
    410     },
    411     {
    412       "flag": "No uncertainty quantification",
    413       "detail": "All results are point estimates with no error bars, confidence intervals, or variance across runs. For a paper making strong quantitative claims (e.g., '1-2% of neurons'), the absence of uncertainty measures is concerning."
    414     },
    415     {
    416       "flag": "No limitations section",
    417       "detail": "The paper lacks any limitations discussion despite testing on only three 7-8B parameter models. Claims of generality are unbounded."
    418     },
    419     {
    420       "flag": "GPT-4o judge not validated",
    421       "detail": "Safety evaluation relies entirely on GPT-4o as an automated judge, but the accuracy of this judge is not validated against human ratings. ASR results are only as reliable as this proxy."
    422     },
    423     {
    424       "flag": "Selective framing of detection results",
    425       "detail": "The abstract claims '>95% detection accuracy across multiple threat types' but hallucination detection with single methods achieves F1 of only 0.476-0.698. The >95% claim requires selecting the best method per task or using multi-level fusion."
    426     }
    427   ],
    428   "cited_papers": [
    429     {
    430       "title": "Universal and transferable adversarial attacks on aligned language models",
    431       "authors": ["Andy Zou", "Zifan Wang", "J. Zico Kolter", "Matt Fredrikson"],
    432       "year": 2023,
    433       "arxiv_id": "2307.15043",
    434       "relevance": "GCG attack used as primary evaluation benchmark; foundational adversarial attack method for LLM safety research."
    435     },
    436     {
    437       "title": "AutoDAN: Generating stealthy jailbreak prompts on aligned large language models",
    438       "authors": ["Xiaogeng Liu", "Nan Xu", "Muhao Chen", "Chaowei Xiao"],
    439       "year": 2024,
    440       "relevance": "AutoDAN attack used as key evaluation benchmark; demonstrates semantic-level adversarial prompt generation."
    441     },
    442     {
    443       "title": "Jailbreaking Black Box Large Language Models in Twenty Queries",
    444       "authors": ["Patrick Chao", "Alexander Robey", "Edgar Dobriban", "Hamed Hassani", "George J. Pappas", "Eric Wong"],
    445       "year": 2025,
    446       "arxiv_id": "2310.08419",
    447       "relevance": "PAIR attack method used in evaluation; representative of iterative refinement jailbreak approaches."
    448     },
    449     {
    450       "title": "Improving alignment and robustness with circuit breakers",
    451       "authors": ["Andy Zou", "Long Phan", "Justin Wang"],
    452       "year": 2024,
    453       "relevance": "CircuitBreaker defense using representation rerouting — directly related to causality-based safety interventions."
    454     },
    455     {
    456       "title": "Constitutional AI: Harmlessness from AI feedback",
    457       "authors": ["Yuntao Bai"],
    458       "year": 2022,
    459       "arxiv_id": "2212.08073",
    460       "relevance": "Foundational safety alignment approach evaluated in the defense taxonomy."
    461     },
    462     {
    463       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    464       "authors": ["Evan Hubinger"],
    465       "year": 2024,
    466       "arxiv_id": "2401.05566",
    467       "relevance": "Sleeper agent backdoor attack used in detection evaluation; relevant to AI safety and deceptive alignment."
    468     },
    469     {
    470       "title": "Llama Guard: LLM-based input-output safeguard for human-AI conversations",
    471       "authors": ["Hakan Inan"],
    472       "year": 2023,
    473       "arxiv_id": "2312.06674",
    474       "relevance": "Safety classifier for LLM outputs; part of the defense taxonomy in the survey."
    475     },
    476     {
    477       "title": "SmoothLLM: defending large language models against jailbreaking attacks",
    478       "authors": ["Alexander Robey", "Eric Wong", "Hamed Hassani", "George J. Pappas"],
    479       "year": 2023,
    480       "arxiv_id": "2310.03684",
    481       "relevance": "Perturbation-based defense method; representative of inference-level safety approaches."
    482     },
    483     {
    484       "title": "TruthfulQA: Measuring how models mimic human falsehoods",
    485       "authors": ["Stephanie Lin", "Jacob Hilton", "Owain Evans"],
    486       "year": 2022,
    487       "relevance": "Hallucination benchmark used in detection evaluation; measures model truthfulness."
    488     },
    489     {
    490       "title": "Jailbroken: How does LLM safety training fail?",
    491       "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"],
    492       "year": 2023,
    493       "relevance": "Foundational analysis of safety training failure modes; key reference in the causality-guided attack taxonomy."
    494     },
    495     {
    496       "title": "GradSafe: Detecting unsafe prompts for LLMs via safety-critical gradient analysis",
    497       "authors": ["Yueqi Xie"],
    498       "year": 2024,
    499       "relevance": "Gradient-based safety detection method; related process-based defense approach."
    500     }
    501   ]
    502 }

Impressum · Datenschutz