scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (30996B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Domain-Specific Constitutional AI: Enhancing Safety in LLM-Powered Mental Health Chatbots",
      6     "authors": [
      7       "Chenhan Lyu",
      8       "Yutong Song",
      9       "Pengfei Zhang",
     10       "Amir M. Rahmani"
     11     ],
     12     "year": 2025,
     13     "venue": "IEEE BSN 2025",
     14     "arxiv_id": "2509.16444",
     15     "doi": "10.1109/BSN66969.2025.11337405"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The abstract claims an approach for domain-specific CAI in mental health, and the results demonstrate improvements across all five guidelines. The quantitative evaluation claim is supported by Tables II-III.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper makes causal claims ('specific principles yield significant safety enhancements'). The four-condition experimental design with controlled single-variable manipulation (principle type) provides adequate support for these claims, though lack of statistical tests weakens confidence.",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper title claims 'Enhancing Safety in LLM-Powered Mental Health Chatbots' broadly, but results are only on LLaMA 3.2 1B/3B with a single evaluation framework and 100 queries. Discussion section extends claims to 'diverse health domains' and 'medical specialties' without evidence.",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No alternative explanations are considered. For example, improvements could stem from increased training data exposure rather than principle specificity, or the evaluation rubric could favor specific-principle language patterns.",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper measures evaluator scores on 5 guidelines and frames this as 'safety enhancement' and 'effectiveness.' No discussion of whether rubric scores are adequate proxies for actual clinical safety outcomes.",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "There is no dedicated limitations section. The conclusion contains one sentence: 'While effective for current safety needs, our static principles may not adapt to evolving guidelines.' A single sentence in the conclusion does not amount to a dedicated limitations section.",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The only limitation mentioned (static principles) is generic. No discussion of specific threats like small evaluation set, single model family, evaluator reliability, or single evaluation framework.",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No explicit scope boundaries. The paper does not state what it did NOT test (other model families, other languages, real clinical settings, etc.).",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding disclosure or acknowledgments section is present in the paper.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All authors are listed as University of California, Irvine affiliates. The evaluation framework is from the Institute for Future Health (ref [16]), which is Rahmani's lab at UCI — this connection is implicit but affiliations are stated.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No funding information is disclosed, so independence cannot be assessed.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial disclosure statement is present.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "Key terms used loosely: 'safety' used throughout without precise definition (conflated with guideline adherence). Constitutional AI is briefly defined, but 'domain-specific principles' and 'mental health applications' lack precise definitions.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Three contributions explicitly stated: (1) domain-specific constitutional principles design, (2) quantitative comparison against baselines, (3) exploration of smaller models matching larger ones.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "Paper cites relevant work but engagement is superficial—mostly literature listing. Claims gap in research (no prior work comparing domain-specific vs. general CAI principles in mental health) but does not deeply situate contribution relative to existing CAI frameworks.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No repository URL, code archive, or link to training code is provided anywhere in the paper.",
    124           "source": "opus"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "The paper uses MentalChat16K, a publicly available dataset on HuggingFace (ref [15]), and the evaluation framework from ref [16]. The training data source is publicly accessible.",
    130           "source": "opus"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No environment specifications, dependency lists, or hardware details are provided. The paper mentions LLaMA 3.2 1B/3B architectures but provides no setup information.",
    136           "source": "opus"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No reproduction instructions, README, or step-by-step guide is provided.",
    142           "source": "opus"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "All results in Tables II and III are reported as point estimates with no confidence intervals or error bars.",
    150           "source": "opus"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "The paper claims specific principles outperform vague/general principles and baselines but provides no statistical significance tests. Comparisons are made solely by comparing raw score numbers.",
    156           "source": "opus"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Percentage improvements are reported with baseline context throughout Section III.B, e.g., 'improving from 4.41 in the baseline to 6.47 with specific principles (46.7% increase)' and similar for each guideline.",
    162           "source": "opus"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "100 evaluation queries and 5000 training rows are used but no justification for these sample sizes is given, nor any power analysis.",
    168           "source": "opus"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "No variance, standard deviation, or spread measures are reported for any results. It is unclear whether evaluations were run multiple times.",
    174           "source": "opus"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Four conditions compared: no principle baseline (1B), vague/general principles (1B), specific principles (1B), and no principle (3B). Section II.B describes the experimental design.",
    182           "source": "opus"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Uses LLaMA 3.2 (2024) as the base architecture and compares against the original Anthropic CAI approach with general principles. The baselines are appropriate for the comparison being made.",
    188           "source": "opus"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Section III.D presents an ablation study replacing two of four specific principles with vague counterparts, showing the contribution of principle specificity.",
    194           "source": "opus"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Five separate guideline metrics are reported (practice adherence, health risks, critical response, resource provision, user empowerment) plus a total score.",
    200           "source": "opus"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "Section III.A states 'Trained evaluators scored responses on a 1-10 scale per guideline using detailed rubrics aligned with clinical best practices.' Health experts provided ground truth.",
    206           "source": "opus"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": false,
    211           "justification": "The evaluation uses 100 mental health queries but there is no mention of train/test/validation splits for the evaluation set, or whether the evaluation queries overlap with training data.",
    212           "source": "opus"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Tables II and III and Figure 2 provide per-guideline breakdowns across all five evaluation guidelines for each model variant.",
    218           "source": "opus"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": false,
    223           "justification": "No failure cases, error analysis, or qualitative examples of where the models still fail are discussed.",
    224           "source": "opus"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": false,
    229           "justification": "Every comparison shows monotonic improvement from baseline to specific principles. The ablation shows the ablated version underperforms specific but still improves over vague — no configurations that hurt performance are reported.",
    230           "source": "opus"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Specific model sizes are stated: 'the 1B parameter LLaMA 3.2 architecture' and 'the 3B parameter LLaMA 3.2 architecture' (Section II.B). LLaMA 3.2 is specific enough as a model release.",
    238           "source": "opus"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": false,
    243           "justification": "Only a template format is mentioned: 'standardized prompting templates (e.g., \"Critique this response against these principles: [principle text].\")'. The full actual prompts used for SFT and RLAIF phases are not provided.",
    244           "source": "opus"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": false,
    249           "justification": "Training used 5000 rows with early stopping and 2 response pairs per example for preference generation, but no learning rates, batch sizes, optimizer, temperature, number of epochs, or other standard hyperparameters are reported.",
    250           "source": "opus"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "No agentic scaffolding is used. The paper trains models via SFT and RLAIF — standard fine-tuning, not agentic workflows.",
    256           "source": "opus"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": false,
    261           "justification": "The paper states 'sampling 5000 rows' from MentalChat16K but does not describe how those rows were selected, whether any filtering was applied, or how the data was preprocessed for training.",
    262           "source": "opus"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "Individual evaluation scores per query are not available. Only aggregated per-guideline means are reported in Tables II-III.",
    270           "source": "opus"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section III.A describes the evaluation framework: 100 mental health queries, health expert ground truth, trained evaluators scoring on 1-10 rubrics. Training data comes from MentalChat16K (ref [15]).",
    276           "source": "opus"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": true,
    280           "answer": false,
    281           "justification": "The paper mentions 'trained evaluators' and 'health experts' but does not describe how they were recruited, how many there were, or their qualifications.",
    282           "source": "opus"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": false,
    287           "justification": "The pipeline from raw MentalChat16K to 5000 sampled rows to trained models to evaluation is described at a high level but lacks detail on sampling strategy, data cleaning, and intermediate steps.",
    288           "source": "opus"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "No training data cutoff date is stated for LLaMA 3.2. The evaluation queries could have been seen during pre-training.",
    296           "source": "opus"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "No discussion of whether the 100 evaluation queries or their answers overlap with LLaMA 3.2's pre-training data or MentalChat16K.",
    302           "source": "opus"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "The evaluation framework (ref [16]) was published before LLaMA 3.2's training. No discussion of whether benchmark content could be in the training data.",
    308           "source": "opus"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants in the study. Evaluators score model outputs but are not study subjects.",
    316           "source": "opus"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants. The study evaluates model outputs, not human subjects.",
    322           "source": "opus"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in the study.",
    328           "source": "opus"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in the study.",
    334           "source": "opus"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in the study.",
    340           "source": "opus"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in the study.",
    346           "source": "opus"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in the study.",
    352           "source": "opus"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "The paper discusses efficiency advantages qualitatively (1B vs 3B, on-device processing) but reports no actual inference costs, latency measurements, or tokens consumed.",
    360           "source": "opus"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "No GPU hours, training time, or computational budget is reported despite training multiple model variants.",
    366           "source": "opus"
    367         }
    368       },
    369       "experimental_rigor": {
    370         "seed_sensitivity_reported": {
    371           "applies": true,
    372           "answer": false,
    373           "justification": "No mention of multiple random seeds or seed sensitivity analysis for training or evaluation.",
    374           "source": "opus"
    375         },
    376         "number_of_runs_stated": {
    377           "applies": true,
    378           "answer": false,
    379           "justification": "It is not stated how many runs produced the reported results. Appears to be single-run.",
    380           "source": "opus"
    381         },
    382         "hyperparameter_search_budget": {
    383           "applies": true,
    384           "answer": false,
    385           "justification": "No hyperparameter search budget is reported. Only '5000 rows' and '2 response pairs' are mentioned with no search process.",
    386           "source": "opus"
    387         },
    388         "best_config_selection_justified": {
    389           "applies": true,
    390           "answer": false,
    391           "justification": "No discussion of how the final configuration was selected or whether other configurations were tried.",
    392           "source": "opus"
    393         },
    394         "multiple_comparison_correction": {
    395           "applies": false,
    396           "answer": false,
    397           "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable.",
    398           "source": "opus"
    399         },
    400         "self_comparison_bias_addressed": {
    401           "applies": true,
    402           "answer": false,
    403           "justification": "The authors evaluate their own training approach using an evaluation framework from a co-author's lab (Institute for Future Health, ref [16]) without acknowledging this potential bias.",
    404           "source": "opus"
    405         },
    406         "compute_budget_vs_performance": {
    407           "applies": true,
    408           "answer": false,
    409           "justification": "The 1B vs 3B comparison implies different compute but no analysis of performance as a function of compute budget is provided.",
    410           "source": "opus"
    411         },
    412         "benchmark_construct_validity": {
    413           "applies": true,
    414           "answer": false,
    415           "justification": "The evaluation uses 5 guidelines from ref [16] without discussing whether these guidelines adequately capture 'safety' in mental health chatbots or whether the 1-10 scoring rubric has construct validity.",
    416           "source": "opus"
    417         },
    418         "scaffold_confound_addressed": {
    419           "applies": false,
    420           "answer": false,
    421           "justification": "No scaffolding is involved. Models are evaluated directly.",
    422           "source": "opus"
    423         }
    424       }
    425     }
    426   },
    427   "claims": [
    428     {
    429       "claim": "Domain-specific mental health principles improve safety scores by 31.7% over general ethical principles",
    430       "evidence": "Specific principles model: 24.08 total score vs. vague/general: 18.29. Ablation study shows 19.2% reduction when specificity removed.",
    431       "supported": "moderate"
    432     },
    433     {
    434       "claim": "Smaller models trained with specific CAI principles outperform larger unprincipled models",
    435       "evidence": "1B model with specific principles (24.08) > 3B baseline without CAI (19.92). But the 1B model receives CAI training while the 3B model does not—confounds size with training approach.",
    436       "supported": "weak"
    437     },
    438     {
    439       "claim": "Constitutional AI training improves crisis response consistency in mental health chatbots",
    440       "evidence": "Guideline 3 (crisis response) improved from 1.06 → 2.69 (153.8% gain). Guideline 4 (resources) from 1.13 → 2.91 (157.5%). Absolute scores remain very low (2.69/10, 2.91/10).",
    441       "supported": "moderate"
    442     },
    443     {
    444       "claim": "Explicit mental health-derived principles are essential for robust crisis alignment",
    445       "evidence": "Ablation study shows 19.2% total reduction when specificity removed. Guidelines 3 & 4 (crisis-specific) most sensitive to principle specificity.",
    446       "supported": "moderate"
    447     },
    448     {
    449       "claim": "General ethical principles are inadequate without domain-specific tailoring",
    450       "evidence": "Vague/general principles yield 18.29 vs. specific 24.08 (31.7% gap). Only 24.7% improvement from no CAI to general CAI, vs. 75.5% from no CAI to specific.",
    451       "supported": "moderate"
    452     },
    453     {
    454       "claim": "The approach generalizes to other medical specialties via principle extraction from guidelines",
    455       "evidence": "No evidence provided. Discussion states approach is 'adaptable to medical specialties' but this is aspirational, not empirically demonstrated.",
    456       "supported": "unsupported"
    457     }
    458   ],
    459   "methodology_tags": [
    460     "benchmark-eval",
    461     "empirical"
    462   ],
    463   "key_findings": "Domain-specific constitutional AI principles derived from mental health guidelines improve LLM alignment on mental health tasks by 31.7% compared to generic ethical principles. A 1B parameter model trained with specific principles outperforms a 3B model that received no CAI training, suggesting that principled alignment training can overcome scale limitations for resource-constrained healthcare deployment. Crisis intervention and resource provision show the largest relative improvements, rising 153.8% and 157.5% respectively, though absolute scores remain low (2.7–2.9 out of 10).",
    464   "red_flags": [
    465     {
    466       "flag": "Inadequate statistical rigor",
    467       "detail": "No confidence intervals, error bars, or significance tests on 100-query evaluation set. Point estimates only. Cannot assess whether differences are statistically significant or clinically meaningful."
    468     },
    469     {
    470       "flag": "Small, unjustified sample size",
    471       "detail": "Only 100 test queries for a safety-critical mental health application. No power analysis or justification. In mental health AI, 100 scenarios may miss rare but critical failure modes."
    472     },
    473     {
    474       "flag": "Evaluation metric not validated",
    475       "detail": "Custom 5-guideline evaluation framework from 'Institute for Future Health' [16] is not described or validated against gold standards. No inter-rater reliability (kappa, ICC) reported for human evaluation."
    476     },
    477     {
    478       "flag": "Overstated safety claims",
    479       "detail": "Abstract claims address 'crisis intervention accuracy', 'therapeutic guideline adherence', and 'severe outcomes such as self-harm or loss of trust', but evaluation is synthetic queries on a custom rubric, not real crisis scenarios."
    480     },
    481     {
    482       "flag": "Proxy outcome gap",
    483       "detail": "Paper claims to enhance 'safety' but measures 'guideline adherence'. Guidelines are necessary but not sufficient for actual safety—a chatbot can follow guidelines yet harm vulnerable users through nuanced failures."
    484     },
    485     {
    486       "flag": "Confounded size comparison",
    487       "detail": "Claims 1B trained model > 3B untrained model. But 1B receives CAI training while 3B does not. Cannot isolate whether improvement comes from principles or training. Need 3B with specific training to disentangle."
    488     },
    489     {
    490       "flag": "Alternative explanations not discussed",
    491       "detail": "Improvement could partly stem from domain-specific fine-tuning on MentalChat16K, chain-of-thought prompting format, or the evaluation metric's sensitivity. Single mechanism (specific principles) assumed without ablating other factors."
    492     },
    493     {
    494       "flag": "Generalization claims without evidence",
    495       "detail": "Conclusion states approach is 'adaptable to medical specialties via principle extraction from guidelines' but no other specialties tested. Risk of overgeneralization from mental health domain to surgery, oncology, etc."
    496     },
    497     {
    498       "flag": "No real-world validation",
    499       "detail": "Evaluation on synthetic queries only. No deployment with actual users, no measurement of real crisis intervention outcomes, no feedback from mental health professionals using the system."
    500     },
    501     {
    502       "flag": "No code or full reproducibility",
    503       "detail": "No source code released. SFT and RLAIF prompts not fully specified. Hyperparameters missing (learning rate, batch size, temperature, etc.). Code availability critical for CAI reproducibility and principle adaptation."
    504     },
    505     {
    506       "flag": "Low absolute performance on critical tasks",
    507       "detail": "Guideline 3 (crisis response consistency): 2.69/10. Guideline 4 (resource provision): 2.91/10. Absolute scores remain very low in the domain where failure is highest-stakes."
    508     },
    509     {
    510       "flag": "Test set held-out status unclear",
    511       "detail": "No explicit statement on whether 100 test queries are held-out from 5000-row training sample. Train/test split not documented. Risk of data leakage."
    512     }
    513   ],
    514   "cited_papers": [
    515     {
    516       "title": "Constitutional AI: Harmlessness from AI Feedback",
    517       "authors": "Bai et al.",
    518       "year": 2022,
    519       "relevance": "Foundational CAI methodology. Directly extended by this work to mental health domain."
    520     },
    521     {
    522       "title": "Specific versus General Principles for Constitutional AI",
    523       "authors": "Kundu et al.",
    524       "year": 2023,
    525       "relevance": "Prior work comparing specific vs. general CAI principles. This paper extends finding to mental health context."
    526     },
    527     {
    528       "title": "Large Language Models for Mental Health Applications: Systematic Review",
    529       "authors": "Guo et al.",
    530       "year": 2024,
    531       "relevance": "Systematic review of LLMs in mental health. Provides domain context and identifies safety gaps this paper aims to address."
    532     },
    533     {
    534       "title": "The Opportunities and Risks of Large Language Models in Mental Health",
    535       "authors": "Lawrence et al.",
    536       "year": 2024,
    537       "relevance": "Framework for LLM risks in mental health (crisis intervention, therapeutic accuracy). Motivates domain-specific alignment approaches."
    538     },
    539     {
    540       "title": "A Scoping Review of Large Language Models for Generative Tasks in Mental Health Care",
    541       "authors": "Hua et al.",
    542       "year": 2025,
    543       "relevance": "Recent scoping review of generative LLM applications in mental health. Establishes landscape for domain-specific safety work."
    544     },
    545     {
    546       "title": "How Effective is Constitutional AI in Small LLMs?",
    547       "authors": "Menke & Tan",
    548       "year": 2025,
    549       "relevance": "Recent study on CAI effectiveness in small models. Directly related to this paper's claim that smaller models can be principled."
    550     },
    551     {
    552       "title": "MentalChat16K: A Benchmark Dataset for Conversational Mental Health Assistance",
    553       "authors": "Xu et al.",
    554       "year": 2024,
    555       "relevance": "Dataset used for training. Benchmark for evaluating mental health chatbots."
    556     },
    557     {
    558       "title": "Building Trust in Mental Health Chatbots: Safety Metrics and LLM-Based Evaluation Tools",
    559       "authors": "Park et al.",
    560       "year": 2024,
    561       "relevance": "Defines evaluation framework (5 guidelines) used in this paper. Safety metrics and clinical rubrics for mental health AI."
    562     }
    563   ],
    564   "engagement_factors": {
    565     "practical_relevance": {
    566       "score": 1,
    567       "justification": "Framework presented but no released code, prompts, or implementation. Practitioners cannot directly apply without significant reverse-engineering."
    568     },
    569     "surprise_contrarian": {
    570       "score": 1,
    571       "justification": "Confirms existing finding that specific > general CAI principles (Kundu et al. 2023). No surprising or contrarian insight into mental health AI challenges."
    572     },
    573     "fear_safety": {
    574       "score": 2,
    575       "justification": "Mental health AI is a high-stakes domain where misalignment causes direct harm (crisis escalation, therapeutic failure). Paper addresses this but only via synthetic evaluation, without demonstrating real-world safety."
    576     },
    577     "drama_conflict": {
    578       "score": 0,
    579       "justification": "No conflict or controversy. Uncontested finding that domain-specific alignment helps. No debate with related work or negative results."
    580     },
    581     "demo_ability": {
    582       "score": 0,
    583       "justification": "No code released, no weights published, no accessible tool. Cannot be tried by readers. Full reimplementation required to reproduce."
    584     },
    585     "brand_recognition": {
    586       "score": 1,
    587       "justification": "Authors from UC Irvine (solid but not top-tier AI lab). LLaMA is Meta (brand-recognized) but standard baseline. No novelty in model selection."
    588     }
    589   },
    590   "hn_data": {
    591     "threads": [
    592       {
    593         "hn_id": "41671808",
    594         "title": "First Past the Post: Evaluating Query Optimization in MongoDB",
    595         "points": 4,
    596         "comments": 0,
    597         "url": "https://news.ycombinator.com/item?id=41671808",
    598         "created_at": "2024-09-27T15:36:58Z"
    599       },
    600       {
    601         "hn_id": "45220460",
    602         "title": "Perihelion precession of planetary orbits solved from quantum field theory",
    603         "points": 3,
    604         "comments": 4,
    605         "url": "https://news.ycombinator.com/item?id=45220460",
    606         "created_at": "2025-09-12T09:48:24Z"
    607       },
    608       {
    609         "hn_id": "45302119",
    610         "title": "VCBench: Benchmarking LLMs in Venture Capital",
    611         "points": 1,
    612         "comments": 0,
    613         "url": "https://news.ycombinator.com/item?id=45302119",
    614         "created_at": "2025-09-19T14:32:42Z"
    615       }
    616     ],
    617     "top_points": 4,
    618     "total_points": 8,
    619     "total_comments": 4
    620   }
    621 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs