scan.json (25908B)
1 { 2 "paper": { 3 "title": "SurvivEHR: a competing risks, time-to-event foundation model for multiple long-term conditions from primary care electronic health records", 4 "authors": [ 5 "Charles Gadd", 6 "Krishna Gokhale", 7 "Aditya Acharya", 8 "Jennifer Cooper", 9 "Francesca Crowe", 10 "Leah Fitzsimmons", 11 "Thomas Jackson", 12 "Krishnarajah Nirantharakumar", 13 "Christopher Yau" 14 ], 15 "year": 2025, 16 "venue": "medRxiv", 17 "doi": "10.1101/2025.08.04.25332916" 18 }, 19 "scan_version": 2, 20 "active_modules": ["experimental_rigor", "data_leakage"], 21 "methodology_tags": ["benchmark-eval"], 22 "key_findings": "SurvivEHR, a GPT-based foundation model trained on 7.6 billion coded events from 23 million UK primary care patients, achieves strong next-event risk stratification (IEC 0.994) and outperforms baseline survival models (RSF, DeepHit, DeSurv) on clinical prediction tasks when fine-tuned. The model captures known clinical associations (e.g., T2DM→metformin, depression→anxiety) and shows particular benefit in low-resource fine-tuning settings (<100K patients). Zero-shot performance is limited, and multi-step forecasting degrades after 4 steps.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "Code released at http://github.com/cwlgadd/FastEHR and http://github.com/cwlgadd/SurvivEHR under open-source licence, as stated in the Code release policy section." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": false, 33 "justification": "Raw CPRD data is not publicly available. 'Raw data from the study are not publicly available. Data for the study were obtained under licence from CPRD.' Codelists are released on GitHub but the actual patient data is restricted." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "No mention of requirements.txt, Dockerfile, conda environment, or specific library versions in the paper text. 
The FastEHR toolkit is described but environment setup details are not provided in the paper." 39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "The paper states 'We provide detailed instructions to allow others to retrain the model from scratch on appropriately licensed data' but these instructions are in the code repositories, not in the paper itself. The paper lacks step-by-step reproduction instructions." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": true, 50 "justification": "Table 2 reports 95% confidence intervals over 5 random seeds for all fine-tuning experiments (e.g., 'SFT 0.816 ± 0.003')." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "Despite claiming SurvivEHR-FFT 'achieves superior predictive performance,' no statistical significance tests are reported. Comparisons rely on point estimates with confidence intervals but no formal tests (e.g., paired t-test, Wilcoxon)." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "Results in Table 2 provide absolute metric values with baselines for comparison, giving sufficient context to assess effect magnitude (e.g., Ctd of 0.824 vs 0.772 for DeSurv on hypertension task)." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "No power analysis or justification for the cohort sizes used. The 572,096 T2DM cohort and 20,000 multimorbidity cohort sizes are stated but not justified." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": true, 70 "justification": "Table 2 reports results as 'average and 95% confidence interval over 5 random seeds' for all fine-tuning and baseline models." 
71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "Multiple baselines included: Random Survival Forests (RSF), DeepHit, DeSurv, a prevalence-based baseline, and a cross-entropy pre-training alternative. Also scratch fine-tuning (SFT) vs full fine-tuning (FFT) comparison." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "DeSurv (2022), DeepHit (2018), and RSF are established survival analysis baselines. The paper also explains in Table 1 why alternative longitudinal models couldn't be compared (lack of competing risk capability)." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "The paper ablates key design choices: competing risk vs single risk, pre-trained (FFT) vs scratch (SFT) to isolate pre-training benefit, and cohort size ablation (Figure 6C) showing performance vs data size." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "Three metrics reported: time-dependent concordance (Ctd), Integrated Brier Score (IBS), and Integrated Negative Binomial Log-Likelihood (INBLL)." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": false, 97 "justification": "No clinical expert evaluation of model predictions is included. Evaluation is entirely automated via survival metrics. Given the clinical application domain, expert evaluation of predicted trajectories would be valuable." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": true, 102 "justification": "90-5-5 site-level split: 23.6M training, 1.4M validation, 1.5M test patients, 'dividing patients by general practices in England to avoid data leakage across practice.'" 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "IEC scores broken down per event type (diagnoses, investigations, medications) in Figure 3A and Supplementary Figures. 
Fine-tuning evaluated across three distinct clinical tasks." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "Multi-step forecasting degradation discussed (Figure 3B): 'after four steps it is no longer able to outperform the prevalence-based baseline.' Zero-shot failure noted. Non-causal predictions discussed (NSAID after osteoporosis, SSRIs after visual impairment)." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "Zero-shot performance failure is explicitly reported: 'zero-shot predictive performance is not attained by SurvivEHR.' Multi-step degradation and learnt positional embeddings showing 'no benefit for the increased computational cost' also reported." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "Abstract claims of 'strong risk stratification performance' (IEC 0.994), 'outperforms benchmark survival models' (Table 2), 'transfers effectively to fine-tuned prognostic tasks, particularly in low-resource settings' (Figure 6C) are all supported by results." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "The paper is careful about causal claims, explicitly stating 'this experiment should not be considered causal prediction since SurvivEHR only learns associative and not causal relationships.' Ablation claims (SFT vs FFT) are justified by controlled single-variable manipulation." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper bounds generalization to UK primary care data: 'our specific instance of SurvivEHR is trained only on a UK population and its most prevalent MLTCs.' The Discussion acknowledges data is from one health system." 
135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": true, 139 "justification": "The paper discusses that improved performance could be from architecture alone (addressed via SFT vs FFT comparison), that predictions are associative not causal, and that the model is subject to biases in training data." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": true, 144 "justification": "The paper clearly distinguishes between what is measured (IEC, Ctd, IBS, INBLL on specific clinical tasks) and the broader goal (clinical risk prediction for MLTCs). Claims match the granularity of measurements." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": true, 151 "justification": "This is a custom model architecture fully described in the Methods. The 384-dimensional latent space, transformer architecture details, and DeSurv survival head are specified. Hyperparameters in Supplementary Table S7." 152 }, 153 "prompts_provided": { 154 "applies": false, 155 "answer": false, 156 "justification": "The paper does not use prompting. SurvivEHR is trained from structured EHR data, not via language model prompting." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "'Model hyperparameters used for training are given in Supplementary Table S7.' Context window of 512 for fine-tuning is also stated." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "No agentic scaffolding is used. This is a standard model training and evaluation pipeline." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "Extensive preprocessing documentation: DExtER extraction, FastEHR pipeline, code lists from DM+D, inclusion criteria (English patients with 12 months acceptable data before Jan 2005), 90-5-5 site-level split. 
Post-processing counts provided (51M diagnoses, 4B medications, etc.)." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": true, 178 "justification": "The Discussion section contains substantial limitations discussion covering: zero-shot failure, multi-step degradation, associative-only learning, training data biases, lack of free text data, privacy limitations on model sharing." 179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": true, 183 "justification": "Specific threats discussed: model cannot produce causal predictions, subject to biases in training data, free text omission from CPRD creates known biases (ref [40]), generative capability could recapitulate real records limiting model sharing, limited to UK population." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": true, 188 "justification": "Explicitly states: trained only on UK population, 74 specific long-term conditions, no free text data, no causal predictions. States zero-shot is not achieved. Explicitly notes the model is 'trained only on a UK population and its most prevalent MLTCs.'" 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": false, 195 "justification": "Raw CPRD data is not publicly available: 'Raw data from the study are not publicly available.' Access requires Research Data Governance approval from CPRD." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Detailed description of CPRD data source, DExtER extraction, code lists for 74 conditions, 81 medication classes, 108 test types. Inclusion criteria, time period, and data quality measures described." 201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants recruited. 
The data comes from a routine primary care database (CPRD) covering participating GP practices. This is an observational study using existing administrative data." 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "Full pipeline documented: CPRD → DExtER extraction → FastEHR preprocessing (tokenisation, cleaning, outlier removal, deduplication) → 90-5-5 site-level split. Final counts provided at each stage: 26.5M patients → 23.6M/1.4M/1.5M split." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Funding disclosed: NIHR Artificial Intelligence for Multiple Long-Term Conditions (Award ID: NIHR202632), UK EPSRC (EP/Y018192/1), UKRI Turing AI Acceleration Fellowship (EP/V023233/1)." 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "All author affiliations listed: University of Oxford, University of Birmingham, King's College London, Health Data Research UK." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": true, 227 "justification": "Funding from NIHR, EPSRC, and UKRI — public research funders with no financial stake in the model's performance. 'The views expressed are those of the author(s) and not necessarily those of the NIHR or the Department of Health and Social Care.'" 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": true, 232 "justification": "'The authors declare no competing interests' — explicit competing interests statement provided." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": false, 238 "answer": false, 239 "justification": "This paper trains its own model on CPRD data with controlled site-level train/test splits. It does not evaluate a pre-trained LLM on public benchmarks. Contamination in the LLM-benchmark sense is not applicable." 
240 }, 241 "train_test_overlap_discussed": { 242 "applies": false, 243 "answer": false, 244 "justification": "Not applicable in the LLM benchmark contamination sense. The paper controls its own train/test split at the practice site level specifically to prevent leakage." 245 }, 246 "benchmark_contamination_addressed": { 247 "applies": false, 248 "answer": false, 249 "justification": "Not applicable — the paper does not evaluate against pre-existing public benchmarks that could be in an LLM's training data." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants. This is a model development study using existing administrative health records from CPRD." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants recruited. CPRD data access was approved under study reference ID 21_000683, which involves data governance rather than human subjects ethics review." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants. Patient demographics of the CPRD cohort are reported (Figure 2C) but this is descriptive statistics of the training data, not a human subjects study." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants recruited. Data inclusion criteria for CPRD records are documented but this is data filtering, not human subject recruitment." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants. This is a model development study, not a human subjects experiment." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants. Not applicable to a model training study." 
282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants. Not applicable." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "No inference cost, latency, or computational cost per prediction is reported despite the model being proposed for clinical use." 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "No training compute budget stated — no GPU hours, training time, or hardware specifications mentioned for the pre-training or fine-tuning experiments." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": true, 305 "justification": "Table 2 reports results 'over 5 random seeds' with confidence intervals for all fine-tuning experiments." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": true, 310 "justification": "'We report the average and 95% confidence interval over 5 random seeds' is explicitly stated for Table 2." 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "No hyperparameter search budget reported. Hyperparameters are listed in Supplementary Table S7 but how they were selected is not described." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": false, 320 "justification": "No description of how hyperparameters or the best configuration were selected. A validation set exists but the selection procedure is not documented." 321 }, 322 "multiple_comparison_correction": { 323 "applies": true, 324 "answer": false, 325 "justification": "No statistical tests are performed, so no multiple comparison correction is applied. The paper compares across multiple tasks and models without formal testing." 
326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "The authors implement both their own system and all baselines without acknowledging the bias of evaluating their own system. RSF, DeepHit, and DeSurv implementations are not stated to be official or independently validated." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "SurvivEHR uses full longitudinal patient histories and is pre-trained on 23M patients, while baselines use cross-sectional input. The resulting information and compute advantage is not discussed or controlled for in comparisons." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": true, 340 "justification": "The paper discusses construct validity of its evaluation: proposes the new IEC metric with justification, explains why standard concordance metrics are impractical for sparse EHR events, and discusses the gap between pre-training and evaluation objectives." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "No agentic scaffolding is involved. This is a traditional ML model comparison." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": true, 352 "justification": "The site-level split prevents leakage across practices; temporal leakage is controlled separately in the fine-tuning design. Fine-tuning experiments use index dates with 'subsequent events excluded until either an outcome is observed or the last observation within the study period.' FastEHR includes leakage control." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": true, 357 "justification": "The paper addresses feature leakage through careful fine-tuning design: 'we increase the context window to 512, prepend any previous diagnoses which may otherwise be lost, and remove repeated events.' For risk factor experiments, weight values are masked to ensure self-consistency."
358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": true, 362 "justification": "Site-level splitting explicitly addresses non-independence: '90-5-5 site-level training, validation and test random split, dividing patients by general practices in England to avoid data leakage across practice.'" 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": true, 367 "justification": "Site-level splitting is a concrete leakage prevention method. FastEHR is described as enabling 'controlling for general practice- and dataset-level data leakage.' This is a structural prevention approach rather than post-hoc detection." 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "SurvivEHR achieves a marginal IEC of 0.994, meaning the true next event was among the top ~1.6 predicted events on average.", 374 "evidence": "Figure 3A and Results section: 'We obtain a marginal IEC of 0.994' compared to 0.864 for prevalence baseline and 0.699 for cross-entropy pre-training.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "SurvivEHR-FFT achieves superior predictive performance to all benchmarks on clinical risk prediction tasks.", 379 "evidence": "Table 2: FFT achieves best Ctd (0.824 vs 0.772 DeSurv), IBS (0.0765 vs 0.0826), and INBLL (0.242 vs 0.262) on hypertension, and best across all metrics on CVD and multimorbidity tasks. 
Results over 5 seeds with CIs.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "Pre-training provides substantial benefits in low-resource fine-tuning settings with fewer than 100,000 patients.", 384 "evidence": "Figure 6C: ablation showing FFT maintains performance with reduced cohort sizes while SFT and other baselines degrade more sharply.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "SurvivEHR captures clinically meaningful associations such as T2DM→metformin, depression→anxiety, and T2DM→osteoarthritis.", 389 "evidence": "Figures 4A-C: transition probability matrices showing known clinical relationships recovered from model predictions, with clinical literature citations supporting each association.", 390 "supported": "strong" 391 }, 392 { 393 "claim": "Multi-step forecasting degrades after four steps, falling below the prevalence baseline.", 394 "evidence": "Figure 3B: marginal IEC evaluated at increasing prediction horizons shows degradation beyond 4 steps.", 395 "supported": "strong" 396 } 397 ], 398 "red_flags": [ 399 { 400 "flag": "No compute budget reported", 401 "detail": "Training a transformer on 7.6 billion events from 23 million patients requires substantial compute, but no GPU hours, hardware specs, or training time is reported. This limits reproducibility and practical assessment." 402 }, 403 { 404 "flag": "Unfair information and compute comparison with baselines", 405 "detail": "SurvivEHR uses full longitudinal histories and is pre-trained on 23M patients, while baselines (RSF, DeepHit, DeSurv) use only cross-sectional input. The resulting information and pre-training compute advantage is not discussed as a confound." 406 }, 407 { 408 "flag": "No statistical significance tests", 409 "detail": "Claims of 'superior performance' rely on comparing point estimates and CIs but no formal statistical tests are reported. For some tasks (e.g., CVD), differences between FFT and DeSurv are within overlapping confidence intervals."
410 }, 411 { 412 "flag": "Restricted data limits independent verification", 413 "detail": "CPRD data requires governance approval, and model weights cannot be shared due to privacy concerns. While understandable for medical data, this means results cannot be independently verified." 414 } 415 ], 416 "cited_papers": [ 417 { 418 "title": "The shaky foundations of large language models and foundation models for electronic health records", 419 "authors": ["Michael Wornow"], 420 "year": 2023, 421 "relevance": "Critiques foundation models for EHR, directly relevant to evaluating methodology quality of foundation model claims." 422 }, 423 { 424 "title": "MOTOR: a time-to-event foundation model for structured medical records", 425 "authors": ["Ethan Steinberg"], 426 "year": 2023, 427 "relevance": "Most directly comparable prior work — a time-to-event foundation model for EHR, key baseline for methodological comparison." 428 }, 429 { 430 "title": "Are emergent abilities of large language models a mirage?", 431 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 432 "year": 2024, 433 "relevance": "Questions emergent capabilities claims in large models, relevant to evaluating overclaiming in foundation model papers." 434 }, 435 { 436 "title": "On the opportunities and risks of foundation models", 437 "authors": ["Rishi Bommasani"], 438 "year": 2021, 439 "arxiv_id": "2108.07258", 440 "relevance": "Foundational paper on foundation model risks and opportunities, directly relevant to survey scope." 441 }, 442 { 443 "title": "Scaling laws for neural language models", 444 "authors": ["Jared Kaplan"], 445 "year": 2020, 446 "arxiv_id": "2001.08361", 447 "relevance": "Scaling laws paper referenced for model design decisions, relevant to AI methodology." 448 }, 449 { 450 "title": "Attention is all you need", 451 "authors": ["Ashish Vaswani"], 452 "year": 2017, 453 "relevance": "Foundational transformer architecture paper, core to the model design." 454 } 455 ] 456 }