ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27920B)


      1 {
      2   "paper": {
      3     "title": "CIA+TA Risk Assessment for AI Reasoning Vulnerabilities",
      4     "authors": ["Yuksel Aydin"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2508.15839"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "The paper mentions 'derived source data and code are released as supplemental material' in Table 2 notes, but no repository URL or archive link is provided anywhere in the paper. Without a working link, this does not satisfy the criterion."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper references 'released logs' and 'supplemental material' multiple times but provides no URL, DOI, or download link for any data. The reader cannot access the data."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications, dependency files, or software versions are mentioned. The only technical detail is 'temperature=0.4; Tokens=500' for decoding parameters, but no broader environment setup is given."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No reproduction instructions are provided. The paper references prior studies [32, 35] for the empirical data but does not describe how to reproduce the risk assessment calculations or experiments."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper reports point estimates (median ASR values, mitigation coefficients) in Table 2 without confidence intervals or error bars. No uncertainty quantification is provided for any of the key metrics."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper makes comparative claims about architecture-dependent differences (e.g., mitigation coefficients ranging from 0.96 to -1.35) but provides no statistical significance tests. The human study reference [32] mentions 'statistically significant' results but this paper does not report the tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports effect sizes with baseline context throughout: mitigation coefficients (eta values), percentage reductions (e.g., '96% reduction'), and the human study improvement of '+7.87 percentage points overall' with relative gains like '+44.4%' for Ethical Responsibility. Table 2 provides exploitability rates and residual risk scores."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper states '12,180 controlled experiments' and '151 participants' but provides no justification for why these sample sizes were chosen, no power analysis, and no discussion of whether the sample sizes are adequate for the claims made."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper reports medians and p75 values (kappa) but does not report standard deviations, interquartile ranges, or variance across experimental runs. The paper explicitly notes 'we did not report standard-deviation-per-turn rates' for context poisoning drift."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The experimental design in Section 5.1 describes three conditions: 'Baseline: Normal operation without adversarial inputs', 'Attack: Exposure to crafted adversarial prompts', and 'Mitigated: Attack condition with defensive interventions.' The baseline condition serves as the comparison point."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "While the paper maps to OWASP LLM Top 10 (2025) and MITRE ATLAS, it does not compare its CIA+TA framework or risk assessment methodology against any existing risk assessment approaches or competing frameworks. No prior risk quantification method is used as a baseline."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The framework has multiple components (CCS-7 taxonomy, risk formulation, CIA+TA extension, CPT) but no ablation study is presented to demonstrate the contribution of individual components. The paper does not test whether removing Trust or Autonomy dimensions affects risk assessment quality."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper uses multiple metrics across vulnerability categories: DOI validity rates (CCS-1), stance deviation scores (CCS-2), decision accuracy (CCS-3), role adoption rates (CCS-4), and action density (CCS-6). Inherent risk and residual risk scores are also computed."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Study 1 (Section 5.1) involved 151 human participants who 'interacted with AI generated outputs under controlled manipulation conditions' to validate the real-world impact of cognitive vulnerabilities and mitigation effectiveness. However, these results come from a previously published study [32], not from new data collected for this paper."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is a framework/methodology paper, not a machine learning benchmark evaluation. There is no training/test split paradigm applicable here."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 2 provides a detailed per-vulnerability breakdown across all seven CCS categories, with exploitability, architecture modifier, inherent risk, mitigation effectiveness, and residual risk for each. Table 3 provides deployment criteria by risk band."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper extensively discusses backfire effects where mitigations increased vulnerability, particularly CCS-5 (Source Interference) with eta = -1.35 (135% increase). Section 5.2 discusses these failure modes and their implications."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that verification-focused prompts were counterproductive for CCS-5 (Source Interference), with median eta = -0.32 indicating backfire. It also reports that CCS-1 (Authority Hallucination) had eta = 0.00, meaning mitigations had no measurable effect."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims (CIA+TA framework, quantitative risk methodology, architecture dependence with 96% reduction to 135% amplification, 151 participants, 12,180 trials) are all supported by corresponding sections in the paper. Table 2 confirms the specific numbers."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims such as 'identical defenses produce effects ranging from 96% reduction to 135% amplification' and 'certain interventions successfully reduce human susceptibility.' These imply causal relationships but the paper does not present the actual experimental design from the prior studies [32, 35] in sufficient detail to assess whether confounds were controlled."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 5.3 (Limitations and Generalizability) explicitly bounds the findings: 'seven architectures provide broad representation' but 'new architectures may exhibit different vulnerability patterns,' 'Laboratory conditions...may not fully capture real-world attack sophistication,' and temporal constraints limiting to 'minutes to hours.'"
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for its findings. For example, the backfire effects could be due to prompt design issues rather than fundamental architecture properties, but no such alternatives are explored. The limitations section discusses methodological constraints but not alternative interpretations."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper mentions 'seven distinct AI architectures' and 'instruction-tuned models, base language models, and specialized reasoning systems' but never specifies which models were used, their versions, or even their names. This is a critical omission."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper references the 'Think First, Verify Always' protocol and 'verification-focused prompts' but does not provide any actual prompt text. The adversarial prompts and defensive prompts are described only in natural language."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 5.1 states 'All experiments used fixed decoding parameters held constant across runs (temperature=0.4; Tokens=500)' which are the key LLM inference parameters."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use agentic scaffolding. The experiments involve direct prompting of AI models under controlled conditions, not agentic workflows."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper does not describe how raw experimental outputs were processed into the metrics reported in Table 2. The transformation from raw model responses to ASR scores, stance deviation measurements, and DOI validity assessments is not documented."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 5.3 'Limitations and Generalizability' provides a dedicated limitations discussion covering architecture coverage, ecological validity, metric limitations, and temporal constraints."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 5.3 discusses specific threats: 'measuring authority hallucination through DOI validity provides objective assessment but may miss subtler forms of confabulation,' 'stance drift measurements assume linear progression, potentially overlooking non-linear manipulation patterns,' and temporal constraints of 'minutes to hours' vs. real attacks over 'days or weeks.'"
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5.3 explicitly states what the results do not show: 'new architectures may exhibit different vulnerability patterns,' 'Adversaries with greater resources...might develop more effective exploitation techniques,' and 'Longer-term attacks operating over days or weeks might exhibit different dynamics.' The paper also notes untested areas like RAG pipelines ('not tested in our AI experiment')."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The paper references 'released logs' and 'supplemental material' but provides no actual link or access mechanism. The reader cannot verify any of the underlying data."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "The data collection is delegated to prior studies [32, 35]. This paper describes the experimental conditions at a high level (baseline/attack/mitigated) but does not describe the actual data collection procedures, instruments, or protocols used."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "For the human study (151 participants), the paper states only that 'Participants, recruited across expertise levels, interacted with AI generated outputs.' No details on recruitment channels, selection criteria, or potential recruitment bias are provided in this paper."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The transformation from 12,180 raw trials to the summary statistics in Table 2 is not documented. How raw outputs were scored, aggregated across architectures, and converted to median ASR and kappa values is not explained."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding information or acknowledgments section is present in the paper. The author is listed as 'Independent Researcher' but there is no statement about funding or lack thereof."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The author is listed as 'Independent Researcher' which is their stated affiliation. Since the paper does not evaluate any specific commercial product, there is no product-specific conflict to disclose."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of any funding statement means the reader cannot determine whether the funder (if any) has a stake in the outcome."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper. The absence of disclosure is not the same as the absence of conflict."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This paper does not evaluate a pre-trained model's capability on any benchmark. It tests AI systems' vulnerability to adversarial manipulation and defense effectiveness, not their knowledge or benchmark performance."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not applicable for the same reason as training_cutoff_stated: the paper tests adversarial vulnerability patterns and defense effectiveness, not pre-trained model knowledge on benchmarks."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not applicable for the same reason as training_cutoff_stated: no benchmark evaluation of model knowledge is conducted. The experiments test reasoning-level vulnerabilities, not benchmark performance."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The human study with 151 participants is from a prior publication [32]. No mention of pre-registration is made in this paper."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No IRB or ethics board approval is mentioned for the human study with 151 participants, which is referenced from prior work [32] but still involves human subjects whose data is reported here."
    243       },
    244       "demographics_reported": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "The only demographic information is that participants were 'recruited across expertise levels.' No details on age, gender, geographic distribution, specific expertise levels, or other demographics are provided."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No inclusion or exclusion criteria are stated for the 151 human participants. The paper provides no information about who was eligible to participate."
    253       },
    254       "randomization_described": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "The paper states participants interacted with AI outputs 'under controlled manipulation conditions' but does not describe how participants were assigned to conditions or whether randomization was used."
    258       },
    259       "blinding_described": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "No blinding information is provided. It is unclear whether participants knew which condition (manipulated vs. non-manipulated) they were in."
    263       },
    264       "attrition_reported": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "No attrition or dropout information is reported. The paper states 151 participants but does not indicate whether all completed the study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The paper conducted 12,180 AI trials across seven architectures but reports no inference costs, API costs, tokens consumed, or wall-clock time."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No computational budget, hardware details, or total resource expenditure is mentioned for the 12,180 experimental trials."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Identical defensive measures produce opposite effects across different architectures, with mitigation coefficients ranging from eta = 0.96 (96% reduction) to eta = -1.35 (135% increase in vulnerability).",
    286       "evidence": "Table 2 shows per-vulnerability mitigation effectiveness medians, and Section 5.2 discusses architecture-dependent patterns. CCS-5 (Source Interference) had median eta = -0.32 (backfire), while CCS-7 (Attention Hijacking) showed eta up to 0.96.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "The CIA+TA framework extends the traditional CIA triad with Trust and Autonomy as necessary security dimensions for cognitive AI systems.",
    291       "evidence": "Section 4 defines Trust (epistemic validation, Section 4.2) and Autonomy (human agency preservation, Section 4.3) with conceptual arguments and references to supporting literature. This is a framework proposal, not an empirically testable claim.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "A brief TFVA micro-lesson (3 minutes) produced statistically significant improvements in security-relevant decision-making (+7.87 percentage points overall).",
    296       "evidence": "Section 4.3 cites reference [32] (a separate paper by the same author) for this claim. The evidence is not presented in this paper; it relies entirely on the prior publication.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "Seven fundamental vulnerability categories (CCS-7) characterize the cognitive attack surface, validated through 12,180 controlled experiments across seven AI architectures.",
    301       "evidence": "Section 2.3 defines the seven categories; Table 2 provides risk metrics. However, the experimental details are in reference [35] (a separate paper by the same author). This paper does not present the raw experimental methodology or results in sufficient detail for independent evaluation.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Identity Confusion (CCS-4) shows strong guardrail effectiveness with median eta = 1.00 (residual risk = 0).",
    306       "evidence": "Table 2 shows CCS-4 with E=0.217, eta=1.00, and residual risk 0.00. Section 5.2 states 'Binary prevention for boundary-based vulnerabilities (CCS-4) achieved eta > 0.9 when properly implemented.'",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["theoretical", "benchmark-eval"],
    311   "key_findings": "The paper proposes the CIA+TA framework extending the CIA triad with Trust (epistemic validation) and Autonomy (human agency preservation) for AI systems with reasoning capabilities. Drawing from two prior studies (151 human participants, 12,180 AI trials across seven architectures), it demonstrates strong architecture dependence in vulnerability patterns, with identical mitigations producing effects ranging from 96% reduction to 135% amplification of vulnerabilities. The paper introduces a quantitative risk assessment methodology with empirically derived coefficients and maps its CCS-7 vulnerability taxonomy to OWASP LLM Top 10 and MITRE ATLAS for operational integration.",
    312   "red_flags": [
    313     {
    314       "flag": "Self-citation dependency",
    315       "detail": "The paper's empirical foundation rests entirely on two prior publications by the same author ([32] and [35]). No new experiments are conducted, and the prior work is not independently replicated. The reader must trust the prior publications, which are also single-author preprints."
    316     },
    317     {
    318       "flag": "Anonymous architectures",
    319       "detail": "The paper tests 'seven distinct AI architectures' across 12,180 trials but never names any of them. The reader cannot assess whether the architectures are representative, contemporary, or even real commercial systems vs. toy implementations."
    320     },
    321     {
    322       "flag": "Phantom supplemental material",
    323       "detail": "The paper repeatedly references 'released logs,' 'supplemental material,' and 'derived source data and code' but provides no URLs, DOIs, or access instructions. The claimed open data cannot be verified."
    324     },
    325     {
    326       "flag": "Missing human study details",
    327       "detail": "The human study with 151 participants is reported without any mention of IRB approval, demographics, randomization procedures, or attrition data. These are critical for assessing the validity of the human validation claims."
    328     },
    329     {
    330       "flag": "No variance or uncertainty quantification",
    331       "detail": "Despite 12,180 trials, the paper reports only medians and p75 values. No standard deviations, confidence intervals, or statistical tests are provided for any of the key metrics, making it impossible to assess the reliability of the reported effect sizes."
    332     }
    333   ],
    334   "cited_papers": [
    335     {
    336       "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection",
    337       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    338       "year": 2023,
    339       "arxiv_id": "2302.12173",
    340       "relevance": "Foundational work on indirect prompt injection attacks against LLM-integrated applications, directly relevant to AI security evaluation."
    341     },
    342     {
    343       "title": "Benchmarking and defending against indirect prompt injection attacks on large language models",
    344       "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu", "Emre Kiciman", "Guangzhong Sun", "Xing Xie", "Fangzhao Wu"],
    345       "year": 2023,
    346       "arxiv_id": "2312.14197",
    347       "relevance": "Benchmark evaluation of indirect prompt injection defenses, directly relevant to LLM security methodology."
    348     },
    349     {
    350       "title": "Universal and transferable adversarial attacks on aligned language models",
    351       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J. Zico Kolter", "Matt Fredrikson"],
    352       "year": 2023,
    353       "arxiv_id": "2307.15043",
    354       "relevance": "Key adversarial attack methodology for aligned LLMs, relevant to AI safety and security evaluation."
    355     },
    356     {
    357       "title": "Towards understanding sycophancy in language models",
    358       "authors": ["Mrinank Sharma", "Meg Tong", "Tomasz Korbak"],
    359       "year": 2023,
    360       "arxiv_id": "2310.13548",
    361       "relevance": "Studies LLM sycophancy behavior which is a reasoning-level vulnerability relevant to AI safety evaluation methodology."
    362     },
    363     {
    364       "title": "Prompt injection attack against LLM-integrated applications",
    365       "authors": ["Yi Liu"],
    366       "year": 2023,
    367       "arxiv_id": "2306.05499",
    368       "relevance": "Direct prompt injection attack methodology, relevant to LLM security testing and evaluation."
    369     },
    370     {
    371       "title": "Automatic and universal prompt injection attacks against large language models",
    372       "authors": ["Xiaogeng Liu"],
    373       "year": 2024,
    374       "arxiv_id": "2403.04957",
    375       "relevance": "Automated prompt injection methodology, relevant to scalable LLM security evaluation."
    376     },
    377     {
    378       "title": "EasyJailbreak: A unified framework for jailbreaking large language models",
    379       "authors": ["Weikang Zhou"],
    380       "year": 2024,
    381       "arxiv_id": "2403.12171",
    382       "relevance": "Unified jailbreak framework relevant to LLM safety and red-teaming evaluation methodology."
    383     },
    384     {
    385       "title": "PoisonedRAG: Knowledge corruption attacks to retrieval-augmented generation of large language models",
    386       "authors": ["Wei Zou", "Runpeng Geng", "Binghui Wang", "Jinyuan Jia"],
    387       "year": 2024,
    388       "arxiv_id": "2402.07867",
    389       "relevance": "RAG-specific poisoning attacks, relevant to evaluating security of retrieval-augmented LLM systems."
    390     },
    391     {
    392       "title": "TrojanRAG: Retrieval-augmented generation can be backdoor driver in large language models",
    393       "authors": ["Pengzhou Cheng", "Yidong Ding", "Tianjie Ju", "Zongru Wu", "Wei Du", "Ping Yi", "Zhuosheng Zhang", "Gongshen Liu"],
    394       "year": 2024,
    395       "arxiv_id": "2405.13401",
    396       "relevance": "Backdoor attacks through RAG pipelines, relevant to LLM supply-chain security evaluation."
    397     },
    398     {
    399       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    400       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu", "Mike Lambert", "Meg Tong", "Monte MacDiarmid", "Daniel M. Ziegler"],
    401       "year": 2024,
    402       "arxiv_id": "2401.05566",
    403       "relevance": "Demonstrates persistent deceptive behaviors in LLMs surviving safety training, relevant to AI alignment and safety evaluation."
    404     },
    405     {
    406       "title": "Red teaming language models to reduce harms: Methods, scaling behaviors, and lessons learned",
    407       "authors": ["Deep Ganguli", "Liane Lovitt", "Jackson Kernion", "Amanda Askell", "Yuntao Bai", "Saurav Kadavath"],
    408       "year": 2022,
    409       "arxiv_id": "2209.07858",
    410       "relevance": "Foundational red-teaming methodology for LLMs, directly relevant to AI safety evaluation practices."
    411     },
    412     {
    413       "title": "Constitutional AI: Harmlessness from AI feedback",
    414       "authors": ["Yuntao Bai", "Andy Jones", "Kamal Ndousse"],
    415       "year": 2022,
    416       "arxiv_id": "2212.08073",
    417       "relevance": "Constitutional AI methodology for aligning LLMs, relevant to AI safety and alignment evaluation."
    418     },
    419     {
    420       "title": "\"Think first, verify always\": Training humans to face AI risks",
    421       "authors": ["Yuksel Aydin"],
    422       "year": 2025,
    423       "arxiv_id": "2508.03714",
    424       "relevance": "Companion human-subject study providing the human validation data used in this framework, relevant to human factors in AI security."
    425     },
    426     {
    427       "title": "Cognitive cybersecurity for artificial intelligence: Guardrail engineering with CCS-7",
    428       "authors": ["Yuksel Aydin"],
    429       "year": 2025,
    430       "arxiv_id": "2508.10033",
    431       "relevance": "Companion paper providing the 12,180 AI trial experiments underlying the risk assessment framework, relevant to LLM vulnerability testing methodology."
    432     }
    433   ]
    434 }

Impressum · Datenschutz