scan.json (22630B)
1 { 2 "paper": { 3 "title": "Domain-Specific Constitutional AI: Enhancing Safety in LLM-Powered Mental Health Chatbots", 4 "authors": ["Chenhan Lyu", "Yutong Song", "Pengfei Zhang", "Amir M. Rahmani"], 5 "year": 2025, 6 "venue": "IEEE BSN 2025", 7 "arxiv_id": "2509.16444", 8 "doi": "10.1109/BSN66969.2025.11337405" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Domain-specific constitutional principles for mental health significantly outperform both vague/general principles (31.7% higher total score) and no constitutional training in LLM safety alignment. A 1B parameter model trained with specific principles outperformed a 3B baseline without constitutional training, suggesting principled alignment can overcome scale limitations. Crisis-related guidelines showed the largest relative improvements (>150%) with specific principles. Ablation study confirmed that replacing even two of four specific principles with vague counterparts reduces total score by 19.2%.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No repository URL, code archive, or link to training code is provided anywhere in the paper." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper uses MentalChat16K, a publicly available dataset on HuggingFace (ref [15]), and the evaluation framework from ref [16]. The training data source is publicly accessible." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No environment specifications, dependency lists, or hardware details are provided. The paper mentions LLaMA 3.2 1B/3B architectures but provides no setup information." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No reproduction instructions, README, or step-by-step guide is provided." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results in Tables II and III are reported as point estimates with no confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims specific principles outperform vague/general principles and baselines but provides no statistical significance tests. Comparisons are made solely by comparing raw score numbers." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Percentage improvements are reported with baseline context throughout Section III.B, e.g., 'improving from 4.41 in the baseline to 6.47 with specific principles (46.7% increase)' and similar for each guideline." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "100 evaluation queries and 5000 training rows are used but no justification for these sample sizes is given, nor any power analysis." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance, standard deviation, or spread measures are reported for any results. It is unclear whether evaluations were run multiple times." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Four conditions compared: no principle baseline (1B), vague/general principles (1B), specific principles (1B), and no principle (3B). Section II.B describes the experimental design." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Uses LLaMA 3.2 (2024) as the base architecture and compares against the original Anthropic CAI approach with general principles. The baselines are appropriate for the comparison being made." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Section III.D presents an ablation study replacing two of four specific principles with vague counterparts, showing the contribution of principle specificity." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Five separate guideline metrics are reported (practice adherence, health risks, critical response, resource provision, user empowerment) plus a total score." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": true, 88 "justification": "Section III.A states 'Trained evaluators scored responses on a 1-10 scale per guideline using detailed rubrics aligned with clinical best practices.' Health experts provided ground truth." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "The evaluation uses 100 mental health queries but there is no mention of train/test/validation splits for the evaluation set, or whether the evaluation queries overlap with training data." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Tables II and III and Figure 2 provide per-guideline breakdowns across all five evaluation guidelines for each model variant." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": false, 103 "justification": "No failure cases, error analysis, or qualitative examples of where the models still fail are discussed." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": false, 108 "justification": "Every comparison shows monotonic improvement from baseline to specific principles. The ablation shows the ablated version underperforms specific but still improves over vague — no configurations that hurt performance are reported." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims an approach for domain-specific CAI in mental health, and the results demonstrate improvements across all five guidelines. The quantitative evaluation claim is supported by Tables II-III." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper makes causal claims ('specific principles yield significant safety enhancements'). The four-condition experimental design with controlled single-variable manipulation (principle type) provides adequate support for these claims, though lack of statistical tests weakens confidence." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper title claims 'Enhancing Safety in LLM-Powered Mental Health Chatbots' broadly, but results are only on LLaMA 3.2 1B/3B with a single evaluation framework and 100 queries. Discussion section extends claims to 'diverse health domains' and 'medical specialties' without evidence." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No alternative explanations are considered. For example, improvements could stem from increased training data exposure rather than principle specificity, or the evaluation rubric could favor specific-principle language patterns." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper measures evaluator scores on 5 guidelines and frames this as 'safety enhancement' and 'effectiveness.' No discussion of whether rubric scores are adequate proxies for actual clinical safety outcomes." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Specific model sizes are stated: 'the 1B parameter LLaMA 3.2 architecture' and 'the 3B parameter LLaMA 3.2 architecture' (Section II.B). LLaMA 3.2 is specific enough as a model release." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "Only a template format is mentioned: 'standardized prompting templates (e.g., \"Critique this response against these principles: [principle text].\")'. The full actual prompts used for SFT and RLAIF phases are not provided." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "Training used 5000 rows with early stopping and 2 response pairs per example for preference generation, but no learning rates, batch sizes, optimizer, temperature, number of epochs, or other standard hyperparameters are reported." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The paper trains models via SFT and RLAIF — standard fine-tuning, not agentic workflows." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": false, 162 "justification": "The paper states 'sampling 5000 rows' from MentalChat16K but does not describe how those rows were selected, whether any filtering was applied, or how the data was preprocessed for training." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no dedicated limitations section. The conclusion contains one sentence: 'While effective for current safety needs, our static principles may not adapt to evolving guidelines.' This is insufficient for a dedicated section." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "The only limitation mentioned (static principles) is generic. No discussion of specific threats like small evaluation set, single model family, evaluator reliability, or single evaluation framework." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "No explicit scope boundaries. The paper does not state what it did NOT test (other model families, other languages, real clinical settings, etc.)." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "Individual evaluation scores per query are not available. Only aggregated per-guideline means are reported in Tables II-III." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section III.A describes the evaluation framework: 100 mental health queries, health expert ground truth, trained evaluators scoring on 1-10 rubrics. Training data comes from MentalChat16K (ref [15])." 192 }, 193 "recruitment_methods_described": { 194 "applies": true, 195 "answer": false, 196 "justification": "The paper mentions 'trained evaluators' and 'health experts' but does not describe how they were recruited, how many there were, or their qualifications." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "The pipeline from raw MentalChat16K to 5000 sampled rows to trained models to evaluation is described at a high level but lacks detail on sampling strategy, data cleaning, and intermediate steps." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding disclosure or acknowledgments section is present in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All authors are listed as University of California, Irvine affiliates. The evaluation framework is from the Institute for Future Health (ref [16]), which is Rahmani's lab at UCI — this connection is implicit but affiliations are stated." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding information is disclosed, so independence cannot be assessed." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial disclosure statement is present." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff date is stated for LLaMA 3.2. The evaluation queries could have been seen during pre-training." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether the 100 evaluation queries or their answers overlap with LLaMA 3.2's pre-training data or MentalChat16K." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "The evaluation framework (ref [16]) was published before LLaMA 3.2's training. No discussion of whether benchmark content could be in the training data." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in the study. Evaluators score model outputs but are not study subjects." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. The study evaluates model outputs, not human subjects." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in the study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in the study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in the study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in the study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "The paper discusses efficiency advantages qualitatively (1B vs 3B, on-device processing) but reports no actual inference costs, latency measurements, or tokens consumed." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No GPU hours, training time, or computational budget is reported despite training multiple model variants." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of multiple random seeds or seed sensitivity analysis for training or evaluation." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "It is not stated how many runs produced the reported results. Appears to be single-run." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search budget is reported. Only '5000 rows' and '2 response pairs' are mentioned with no search process." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "No discussion of how the final configuration was selected or whether other configurations were tried." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors evaluate their own training approach using an evaluation framework from a co-author's lab (Institute for Future Health, ref [16]) without acknowledging this potential bias." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "The 1B vs 3B comparison implies different compute but no analysis of performance as a function of compute budget is provided." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The evaluation uses 5 guidelines from ref [16] without discussing whether these guidelines adequately capture 'safety' in mental health chatbots or whether the 1-10 scoring rubric has construct validity." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved. Models are evaluated directly." 337 } 338 } 339 }, 340 "claims": [ 341 { 342 "claim": "Specific constitutional principles yield 31.7% higher total scores than vague/general principles (24.08 vs 18.29)", 343 "evidence": "Table II shows total scores: vague/general 18.29, specific 24.08. Section III.B provides per-guideline analysis.", 344 "supported": "moderate" 345 }, 346 { 347 "claim": "1B model with specific principles outperforms 3B model without constitutional training (24.08 vs 19.92)", 348 "evidence": "Table II: No principle 3B scores 19.92, Specific principles 1B scores 24.08.", 349 "supported": "moderate" 350 }, 351 { 352 "claim": "Crisis-related guidelines show the largest relative improvements (>150%)", 353 "evidence": "Section III.B: Guideline 3 from 1.06 to 2.69 (153.8%), Guideline 4 from 1.13 to 2.91 (157.5%).", 354 "supported": "moderate" 355 }, 356 { 357 "claim": "Ablation replacing 2 of 4 specific principles reduces total score by 19.2%", 358 "evidence": "Table III and Section III.D: specific 24.08 vs ablated 19.45.", 359 "supported": "moderate" 360 } 361 ], 362 "red_flags": [ 363 { 364 "flag": "No statistical tests", 365 "detail": "All comparisons are raw score differences with no significance testing. With only point estimates and no variance, it is impossible to tell if differences are meaningful or due to noise." 366 }, 367 { 368 "flag": "No variance or uncertainty reporting", 369 "detail": "No standard deviations, confidence intervals, or multi-run results. Single-run training and evaluation with no indication of result stability." 370 }, 371 { 372 "flag": "Evaluator details missing", 373 "detail": "'Trained evaluators' score responses but no information on how many evaluators, their qualifications, inter-rater reliability, or blinding to condition." 374 }, 375 { 376 "flag": "Self-evaluation with own framework", 377 "detail": "The evaluation framework (ref [16]) comes from the Institute for Future Health, which is co-author Rahmani's lab at UCI. The authors are evaluating their own approach with their own lab's evaluation instrument." 378 }, 379 { 380 "flag": "Overclaiming", 381 "detail": "Conclusions extend to 'broader investigation of domain-specific CAI in healthcare specialties' and 'standardization of clinical AI principles' from a single experiment on one model family with 100 queries and no statistical tests." 382 }, 383 { 384 "flag": "Low absolute scores on critical guidelines", 385 "detail": "Crisis-related guidelines (3 and 4) score 2.69 and 2.91 out of 10 even with specific principles — still very poor absolute performance that is not adequately discussed." 386 } 387 ], 388 "cited_papers": [ 389 { 390 "title": "A comprehensive survey of LLM alignment techniques: RLHF, RLAIF, PPO, DPO and more", 391 "authors": ["Z. Wang"], 392 "year": 2024, 393 "arxiv_id": "2407.16216", 394 "relevance": "Comprehensive survey of LLM alignment techniques directly relevant to AI safety methodology." 395 }, 396 { 397 "title": "Foundational challenges in assuring alignment and safety of large language models", 398 "authors": ["U. Anwar"], 399 "year": 2024, 400 "arxiv_id": "2404.09932", 401 "relevance": "Identifies foundational challenges in LLM alignment and safety assurance." 402 }, 403 { 404 "title": "Sleeper agents: Training deceptive LLMs that persist through safety training", 405 "authors": ["E. Hubinger"], 406 "year": 2024, 407 "arxiv_id": "2401.05566", 408 "relevance": "Demonstrates that deceptive behaviors can persist through safety training, relevant to AI safety evaluation." 409 }, 410 { 411 "title": "Constitutional AI: Harmlessness from AI feedback", 412 "authors": ["Y. Bai"], 413 "year": 2022, 414 "arxiv_id": "2212.08073", 415 "relevance": "Foundational paper on Constitutional AI methodology that this paper extends to domain-specific settings." 416 }, 417 { 418 "title": "Specific versus general principles for constitutional AI", 419 "authors": ["S. Kundu"], 420 "year": 2023, 421 "arxiv_id": "2310.13798", 422 "relevance": "Directly compares specific vs general CAI principles — key prior work this paper builds on." 423 }, 424 { 425 "title": "How effective is constitutional AI in small LLMs? A study on DeepSeek-R1 and its peers", 426 "authors": ["A.-G. C. Menke", "P. X. Tan"], 427 "year": 2025, 428 "arxiv_id": "2503.17365", 429 "relevance": "Studies CAI effectiveness in small LLMs, directly related to this paper's efficiency claims." 430 }, 431 { 432 "title": "C3AI: Crafting and evaluating constitutions for constitutional AI", 433 "authors": ["Y. Kyrychenko", "K. Zhou", "E. Bogucka", "D. Quercia"], 434 "year": 2025, 435 "relevance": "Framework for crafting and evaluating AI constitutions, relevant to AI alignment methodology." 436 }, 437 { 438 "title": "Building guardrails for large language models", 439 "authors": ["Y. Dong"], 440 "year": 2024, 441 "arxiv_id": "2402.01822", 442 "relevance": "Survey of LLM guardrail approaches relevant to AI safety in deployment." 443 } 444 ] 445 }