ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (25348B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Domain-Specific Constitutional AI: Enhancing Safety in LLM-Powered Mental Health Chatbots",
      6     "authors": [
      7       "Chenhan Lyu",
      8       "Yutong Song",
      9       "Pengfei Zhang",
     10       "Amir M. Rahmani"
     11     ],
     12     "year": 2025,
     13     "venue": "International Conference on Wearable and Implantable Body Sensor Networks",
     14     "arxiv_id": "2509.16444",
     15     "doi": "10.1109/BSN66969.2025.11337405"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Abstract claims about CAI improving safety are supported by experimental results shown in Tables II–III. Methodology for principle derivation is described and evaluation framework is established.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Four-condition experimental design (no CAI, vague CAI, specific CAI, larger model) supports causal claims about principle effects. Ablation study (Table III) further isolates contribution of specificity.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Claims bounded to mental health chatbots. Evaluation uses 100 queries on common scenarios (depression, anxiety, crises). Applicability to other medical specialties framed as future work.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Ablation compares specific vs vague principles but doesn't explore whether domain-specificity itself matters versus general specificity. No control comparing mental-health principles to domain-specific principles from another field.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Paper claims improvements in 'safety' and 'effectiveness' but measures evaluator-scored responses against five rubric guidelines. No discussion of whether these scores translate to actual harm reduction or real-world safety.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No dedicated limitations or threats-to-validity section. Single-sentence mention in conclusion ('static principles may not adapt to evolving guidelines') does not constitute structured limitations discussion.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Threats to validity not specifically discussed. No inter-rater agreement metrics, sample size justification for 100 queries, or evaluator bias analysis. Evaluator qualifications vaguely described as 'trained evaluators' and 'health experts'.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Scope boundaries not explicitly stated. No discussion of which model sizes the findings apply to, which mental health conversation types were tested, or what scenarios were excluded.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding source disclosed. No acknowledgments section or grant information provided. Absence of disclosure is a red flag.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All authors are listed with UC Irvine affiliations. The paper develops a method rather than evaluating a proprietary product, so there is no direct conflict.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No funder identified; cannot assess independence.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement or financial disclosures provided.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Constitutional AI explained as 'self-critique and revision guided by explicit principles.' Domain-specific principles illustrated in Table I with concrete examples (e.g., 'Use professional help for serious mental health concerns').",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Three explicit contributions stated: (1) domain-specific principle design, (2) quantitative evaluation comparing principles, (3) demonstration that smaller aligned models outperform larger unaligned models.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Introduction engages prior CAI work (Bai et al.), specific-vs-general debate (Kundu et al.), and identifies gap: 'no research has compared constitutional principles explicitly derived from domain-specific mental health guidelines.' Clear positioning relative to existing work.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No code repository, GitHub link, or implementation details provided. Methods are conceptual, not reproducible from paper.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Training dataset MentalChat16K publicly available on HuggingFace (reference [15]). Dataset is accessible.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Model architecture specified (LLaMA 3.2, 1B and 3B) but no requirements.txt, dependency versions, Python version, or environment specifications.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Methods describe CAI training conceptually but provide no step-by-step reproduction instructions or training script sufficient to reimplement.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Table II and Figure 2 report single point estimates (e.g., 6.47, 5.50) with no confidence intervals, standard deviations, or error bars.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "Multiple comparative claims made (e.g., '46.7% increase', '31.7% advantage') but no statistical significance tests, p-values, or hypothesis tests reported.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Percentage improvements reported (e.g., 'Guideline 1 improves 4.41→6.47, 46.7% increase'). Baseline context provided.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "5000 rows sampled for training, 100 queries for evaluation. No sample size justification or power analysis.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Single scores per model per guideline with no standard deviation, range, or indication of variance across runs.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Four conditions: (1) no CAI baseline, (2) vague CAI, (3) specific CAI, (4) larger 3B model without CAI. Multiple baselines for comparison.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": false,
    187           "justification": "Baselines are only internal LLaMA 3.2 variants. No comparison to published mental health chatbots or other safety training methods (RLHF, DPO).",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Section III.D ablation: replacing two specific principles with vague ones (24.08→19.45) isolates contribution of principle specificity.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Five evaluation guidelines used (Table I), each scored 1–10. Per-guideline breakdowns in Table II and Figure 2.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "Trained evaluators scored responses 1–10 using detailed rubrics aligned with clinical best practices. Health experts provided ground-truth responses.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": false,
    211           "justification": "100 evaluation queries used but no explicit confirmation they are held-out from 5000 training examples. Both from MentalChat16K; train-test split not documented.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Table II breaks scores by five guidelines. Figure 2 provides per-guideline bar charts. Figure 3 includes radar visualization.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": false,
    223           "justification": "No failure cases shown or analyzed. No qualitative error analysis or discussion of underperformance scenarios.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": false,
    229           "justification": "Proposed method shows improvements in all comparisons. Ablation (vague vs specific) supports positive claim but is not framed as independent negative result.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "Model family specified (LLaMA 3.2, 1B and 3B) but no exact checkpoint version, snapshot date, or training cutoff.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": false,
    243           "justification": "Conceptual SFT template given ('Critique this response against these principles: [principle text]'). Table I shows principles but complete RLAIF prompts not provided.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": false,
    249           "justification": "Only sparse hyperparameters: 5000 samples, 2 response pairs per example, 'early stopping.' No learning rate, batch size, optimizer, epochs, or stopping criteria.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "CAI scaffolding described: two-phase training (SFT for self-critique + RLAIF), chain-of-thought reasoning about principle conformance before revision.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": false,
    261           "justification": "Only 'sampling 5000 rows' mentioned. No filtering criteria, data cleaning steps, or preprocessing documented.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "MentalChat16K publicly available on HuggingFace. Expert ground-truth responses not released but evaluation uses expert-provided benchmarks.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": false,
    275           "justification": "Paper uses external MentalChat16K but does not document its collection. Details are in reference [15], not this paper.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": true,
    280           "answer": false,
    281           "justification": "Evaluators ('trained evaluators', 'health experts') not characterized. No number of evaluators, expertise criteria, or recruitment process specified.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": false,
    287           "justification": "High-level pipeline stated (sample 5000 → SFT → RLAIF → evaluate 100 queries) but no detailed filtering logic, preprocessing, or sampling procedure documented.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "LLaMA 3.2 pretraining cutoff date not stated. This matters for assessing whether evaluation queries could be in the pretraining data.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "No discussion of overlap between 5000 fine-tuning examples and 100 evaluation queries. Both from MentalChat16K; no confirmation of train-test separation.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "No contamination analysis between pretraining data and evaluation set. No discussion of MentalChat16K's timing relative to LLaMA 3.2 pretraining.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human subject research; only model evaluation with human raters.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants; evaluation uses expert raters, not subject research.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participant demographics; evaluators only.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "Not applicable; model evaluation, not human subject research.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "Not applicable; no human randomization.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "Not applicable; no human participant blinding.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "Not applicable; no human participant attrition.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "No inference latency, cost, or computational requirements reported. Paper motivates resource-constrained settings but provides no actual metrics.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "No training time, GPU hours, or computational budget reported.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Domain-specific constitutional principles improve mental health chatbot safety by 31.7% compared to vague general principles",
    374       "evidence": "Table II: specific principles total score 24.08 vs vague/general 18.29; ablation (Table III) confirms 19.2% reduction when specificity removed",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "1B-parameter models trained with domain-specific CAI outperform unprincipled 3B models",
    379       "evidence": "Table II: 1B specific (24.08) > 3B no-CAI (19.92); discussion claims smaller principled models consistently outperform larger unprincipled ones",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Specific constitutional principles deliver exceptional improvements for crisis intervention (153–158% on crisis guidelines)",
    384       "evidence": "Table II Guidelines 3 and 4: baseline 1.06→2.69 (153.8%), 1.13→2.91 (157.5%); ablation confirms vague principles underperform on crisis response (Table III)",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Explicit mental health-specific principles are essential; vague principles allow interpretive flexibility causing inconsistent crisis responses",
    389       "evidence": "Discussion: 'Vague/general formulations allow interpretive flexibility...leading to inconsistent outputs.' Ablation shows performance loss with vague principles.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Domain-specific CAI enables practical deployment in resource-constrained healthcare environments",
    394       "evidence": "1B model with specific CAI outperforms 3B unaligned; discussion motivates healthcare deployment. However, no actual cost/latency/resource metrics provided.",
    395       "supported": "weak"
    396     }
    397   ],
    398   "methodology_tags": [
    399     "benchmark-eval"
    400   ],
    401   "key_findings": "Constitutional AI training with domain-specific mental health principles significantly improves safety metrics (24.08 total score) over no CAI (13.74) and vague principles (18.29). A 1B-parameter model trained with specific principles outperforms an unprincipled 3B model, suggesting principled alignment may matter more than scale for constrained healthcare settings. Crisis intervention showed the largest gains (153–158% on crisis guidelines), indicating explicit resource provision and professional referral principles are critical for high-stakes scenarios.",
    402   "red_flags": [
    403     {
    404       "flag": "No statistical significance testing",
    405       "detail": "Improvements reported as percentages without p-values or confidence intervals. Cannot determine if 46.7% gains on individual guidelines are statistically significant or noise."
    406     },
    407     {
    408       "flag": "No inter-rater reliability reported",
    409       "detail": "Human evaluators scored outputs but no inter-rater agreement metrics (Kappa, ICC) provided. Evaluator disagreement could dominate claimed effect sizes."
    410     },
    411     {
    412       "flag": "Evaluators not characterized",
    413       "detail": "Described only as 'trained evaluators' and 'health experts.' Number of raters, expertise level, training process, and eligibility criteria not specified."
    414     },
    415     {
    416       "flag": "Small evaluation set without justification",
    417       "detail": "Only 100 mental health queries evaluated. No sample size justification, power analysis, or coverage analysis of mental health scenario diversity."
    418     },
    419     {
    420       "flag": "No comparison to published baselines",
    421       "detail": "Only compares internal variants of LLaMA 3.2. No comparison to published mental health chatbots or alternative safety methods (RLHF, DPO)."
    422     },
    423     {
    424       "flag": "Safety claims not tied to real-world outcomes",
    425       "detail": "Claims 'safety improvements' but measures evaluator scores against rubrics. No evidence scores translate to reduced harm, accurate diagnoses, or better clinical outcomes."
    426     },
    427     {
    428       "flag": "Code and hyperparameters not disclosed",
    429       "detail": "Implementation not released. Sparse hyperparameters (no learning rate, batch size, optimizer, stopping criteria) make independent replication infeasible."
    430     },
    431     {
    432       "flag": "No variance or uncertainty quantification",
    433       "detail": "Single point estimates reported. No error bars, standard deviations, or indication of run-to-run variance. Unclear if single training run or averaged over multiple seeds."
    434     },
    435     {
    436       "flag": "Train-test contamination not addressed",
    437       "detail": "Both 5000 training examples and 100 evaluation queries from MentalChat16K. No confirmation of held-out evaluation set or overlap analysis."
    438     },
    439     {
    440       "flag": "No funding disclosure",
    441       "detail": "No acknowledgments or funding source stated. Raises questions about potential undisclosed support or institutional constraints."
    442     }
    443   ],
    444   "cited_papers": [
    445     {
    446       "title": "Large language models for mental health applications: Systematic review",
    447       "relevance": "Systematic review of LLM mental health applications; establishes domain landscape and motivates domain-specific safety"
    448     },
    449     {
    450       "title": "The opportunities and risks of large language models in mental health",
    451       "relevance": "Reviews LLM opportunities and risks in mental health; motivates need for specialized guardrails beyond generic AI safety"
    452     },
    453     {
    454       "title": "Constitutional ai: Harmlessness from ai feedback",
    455       "relevance": "Foundational Constitutional AI methodology that this paper adapts and builds upon"
    456     },
    457     {
    458       "title": "Specific versus general principles for constitutional ai",
    459       "relevance": "Directly relevant prior work comparing principle specificity in CAI; this paper extends to domain-specific principles"
    460     },
    461     {
    462       "title": "A comprehensive survey of llm alignment techniques",
    463       "relevance": "Surveys alignment methods including RLAIF used in the paper's training pipeline"
    464     },
    465     {
    466       "title": "Building guardrails for large language models",
    467       "relevance": "Relevant to guardrail design and safety constraints for LLM deployment"
    468     },
    469     {
    470       "title": "Building trust in mental health chatbots: Safety metrics and llm-based evaluation tools",
    471       "relevance": "Directly addresses safety metrics and evaluation frameworks for mental health chatbots"
    472     }
    473   ],
    474   "engagement_factors": {
    475     "practical_relevance": {
    476       "score": 2,
    477       "justification": "Mental health chatbots have direct healthcare application; LLaMA 3.2 is publicly available. However, evaluation is synthetic, not real-world deployment."
    478     },
    479     "surprise_contrarian": {
    480       "score": 1,
    481       "justification": "Smaller models beating larger models is somewhat notable, but domain-specific principles outperforming generic ones is expected and incremental."
    482     },
    483     "fear_safety": {
    484       "score": 2,
    485       "justification": "Mental health AI safety is a legitimate concern; paper highlights risks (misdiagnosis, harm escalation) but does not definitively resolve them."
    486     },
    487     "drama_conflict": {
    488       "score": 0,
    489       "justification": "Mental health is sensitive but paper is methodical and technical; no controversial findings or conflict angles."
    490     },
    491     "demo_ability": {
    492       "score": 1,
    493       "justification": "Uses public LLaMA 3.2 and MentalChat16K, but code not released; reimplementation from scratch would be required."
    494     },
    495     "brand_recognition": {
    496       "score": 1,
    497       "justification": "UC Irvine is known but not a top-tier AI lab. IEEE BSN is a specialized venue with lower visibility than major conferences."
    498     }
    499   },
    500   "hn_data": {
    501     "threads": [
    502       {
    503         "hn_id": "41671808",
    504         "title": "First Past the Post: Evaluating Query Optimization in MongoDB",
    505         "points": 4,
    506         "comments": 0,
    507         "url": "https://news.ycombinator.com/item?id=41671808",
    508         "created_at": "2024-09-27T15:36:58Z"
    509       },
    510       {
    511         "hn_id": "45220460",
    512         "title": "Perihelion precession of planetary orbits solved from quantum field theory",
    513         "points": 3,
    514         "comments": 4,
    515         "url": "https://news.ycombinator.com/item?id=45220460",
    516         "created_at": "2025-09-12T09:48:24Z"
    517       },
    518       {
    519         "hn_id": "45302119",
    520         "title": "VCBench: Benchmarking LLMs in Venture Capital",
    521         "points": 1,
    522         "comments": 0,
    523         "url": "https://news.ycombinator.com/item?id=45302119",
    524         "created_at": "2025-09-19T14:32:42Z"
    525       }
    526     ],
    527     "top_points": 4,
    528     "total_points": 8,
    529     "total_comments": 4
    530   }
    531 }

Impressum · Datenschutz