scan-v4.json (27656B)
1 { 2 "scan_version": 4, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Diagnostic Codes in AI prediction models and Label Leakage of Same-admission Clinical Outcomes", 6 "authors": [ 7 "Bashar Ramadan", 8 "Ming-Chieh Liu", 9 "Michael C. Burkhart", 10 "William F Parker", 11 "Brett K. Beaulieu-Jones" 12 ], 13 "year": 2025, 14 "venue": "medRxiv preprint", 15 "arxiv_id": "", 16 "doi": "10.1101/2025.08.09.25333360" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "Abstract claims of AUROC 0.97-0.98, the specific ICD codes as top predictors ('brain death,' 'cardiac arrest,' 'encounter for palliative care,' 'do not resuscitate'), and 40.2% prevalence in the literature are all directly supported by results in the paper.", 24 "source": "opus" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "The causal claim that ICD codes 'inflate' performance via label leakage is supported by a transparent mechanism: codes like 'brain death' and 'cardiac arrest' are only assigned after the clinical event occurs, making their use to predict that event circular. The feature importance analysis directly demonstrates this mechanism.", 30 "source": "opus" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "The title 'Diagnostic Codes in AI prediction models' frames the problem generally, and the Discussion states 'It is very unlikely that this problem is isolated to MIMIC database work but reflects a broader challenge.' However, both analyses are limited to MIMIC and Google Scholar-indexed papers. 
The framing extends beyond what was empirically tested.", 36 "source": "opus" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": true, 41 "justification": "The paper discusses that some diagnoses may be known early in admission (e.g., broken limbs, chronic conditions carried over from prior visits), that problem list codes might be available during a stay, and offers an alternative explanation for the 'external hemorrhoids' anomaly (clinician documentation patterns signaling patient stability).", 42 "source": "opus" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper's claims match the granularity of its measurements. It measures AUROC on MIMIC-IV and claims this demonstrates label leakage on MIMIC-IV. The distinction between what ICD codes represent (billing/clinical thinking) versus patient state is explicitly discussed.", 48 "source": "opus" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": false, 55 "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. 
Limitations are discussed within the Discussion section but are not set apart as a distinct subsection.", 56 "source": "opus" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": true, 61 "justification": "Specific threats are discussed: the study is limited to MIMIC only; other datasets with timestamped ICD codes may not have this problem; ICD codes represent billing/clinical thinking rather than patient state; the literature review used only Google Scholar and top-cited papers.", 62 "source": "opus" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": true, 67 "justification": "The paper explicitly states: 'Both analyses in our study are limited because they only the benchmark MIMIC dataset' and 'While it is not possible to estimate how frequently this occurs on private, institutional datasets.' It also notes MIMIC lacks timestamps and problem list codes.", 68 "source": "opus" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No funding source, grants, or sponsorship are mentioned anywhere in the paper.", 76 "source": "opus" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations are clearly listed: Center for Computational Medicine and Clinical Artificial Intelligence, Department of Medicine, University of Chicago; MacLean Center for Clinical Medical Ethics, University of Chicago.", 82 "source": "opus" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": false, 87 "justification": "No funding information is disclosed, making it impossible to assess funder independence.", 88 "source": "opus" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests statement or financial disclosure is present in the paper.", 94 "source": "opus" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 
"applies": true, 100 "answer": true, 101 "justification": "Label leakage defined with concrete example ('including ICD code for cardiac arrest into inpatient mortality model'). ICD codes identified as 'diagnostic billing codes.' Inpatient mortality standard term.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Two-part contribution explicit in abstract: (1) empirical demonstration of label leakage magnitude in MIMIC via predictive modeling, (2) prevalence assessment via systematic review.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Cites Davis et al. (2023) label leakage framework and states 'This work extends further to clearly illustrate impact.' Engages with Zech et al., Beaulieu-Jones et al. on healthcare ML methodology. Positioned relative to prior work.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper states 'Full source code is available on Github (https://github.com/bbj-lab/data-leakage).' A working URL is provided.", 125 "source": "opus" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "The study uses MIMIC-IV v2.2, 'a publicly available deidentified electronic healthcare record database.' 
This is a standard public dataset accessible through PhysioNet.", 131 "source": "opus" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper mentions scikit-learn and XGBoost libraries but provides no requirements.txt, Dockerfile, library versions, or detailed environment specification sufficient to recreate the computational environment.", 137 "source": "opus" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "While code is released on GitHub, the paper itself contains no step-by-step reproduction instructions, README description, or 'Reproducing Results' section.", 143 "source": "opus" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "AUROCs are reported as point estimates (0.97-0.98) with no confidence intervals, error bars, or uncertainty quantification.", 151 "source": "opus" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": true, 156 "justification": "The paper applies the Benjamini-Hochberg procedure to control for false discovery rate with a threshold of p < 0.05 for ICD codes in the logistic regression model.", 157 "source": "opus" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Odds ratios are reported for ICD codes in the logistic regression model, and AUROCs provide effect size context. 
The 40.2% prevalence rate in the literature review also provides magnitude context.", 163 "source": "opus" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "No power analysis or sample size justification is provided for either the MIMIC-IV cohort (180,640 patients) or the literature review sample (100 papers, with 50 per MIMIC version chosen arbitrarily).", 169 "source": "opus" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "Results appear to be from single experimental runs. No variance, standard deviation, or spread measures across multiple runs are reported for any model.", 175 "source": "opus" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "The paper compares ICD-only model performance against published models: 'These results are even better than published models trained on the same data that also included many additional predictive features' (refs 1, 2). Three model types (LR, RF, XGBoost) are also compared against each other.", 183 "source": "opus" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "Baselines include Renc et al. (2024) and Rajkomar et al. (2018). The former is contemporary; the latter is a foundational reference in healthcare AI. The comparison purpose is to show ICD-only models match or exceed full-feature models.", 189 "source": "opus" 190 }, 191 "ablation_study": { 192 "applies": false, 193 "answer": false, 194 "justification": "The system is intentionally minimal by design (ICD codes + age + sex) to demonstrate label leakage, not to optimize a multi-component system. 
There is essentially one component being tested.", 195 "source": "opus" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "The paper reports both AUROC and balanced accuracy as evaluation metrics: 'performance assessed using AUROC and balanced accuracy.'", 201 "source": "opus" 202 }, 203 "human_evaluation": { 204 "applies": false, 205 "answer": false, 206 "justification": "Human evaluation is irrelevant to demonstrating label leakage in ML models. The claim is about data leakage mechanics, not output quality requiring human judgment.", 207 "source": "opus" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "The paper uses a temporal train/validation/test split: 'We partitioned the dataset by the date of admission into train (70%), validation (10%), test (20%) sets per TRIPOD-AI+ guidelines,' with explicit exclusion of patients appearing in multiple splits.", 213 "source": "opus" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Feature importance breakdowns are provided for all three model types: odds ratios and p-values for logistic regression (Figure 1B), Gini importance for random forest, and gain for XGBoost (Figure 1C). Results are reported per model.", 219 "source": "opus" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "The paper discusses the anomalous appearance of 'external hemorrhoids without complications' as a predictor: 'This anomaly may reflect the model's ability to detect a clinician's focus on documenting less severe conditions, signaling relative patient stability.'", 225 "source": "opus" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": false, 230 "justification": "All three models show uniformly high AUROC (0.97-0.98). 
No configurations that failed, approaches that were tried and abandoned, or experiments showing lower performance are reported.", 231 "source": "opus" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": false, 238 "justification": "The paper names scikit-learn (ref 11) and XGBoost (ref 12) but provides no library version numbers or specific model configurations beyond the model type names.", 239 "source": "opus" 240 }, 241 "prompts_provided": { 242 "applies": false, 243 "answer": false, 244 "justification": "The paper uses traditional ML models (logistic regression, random forest, XGBoost), not prompted language models. No prompting is involved.", 245 "source": "opus" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": false, 250 "justification": "The paper mentions 'tuning hyperparameters in validation set' but does not report any actual hyperparameter values, search ranges, or final selected configurations for any of the three models.", 251 "source": "opus" 252 }, 253 "scaffolding_described": { 254 "applies": false, 255 "answer": false, 256 "justification": "No agentic scaffolding is used. The paper trains standard ML classifiers.", 257 "source": "opus" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "Preprocessing steps are documented: ICD-10 to ICD-9 conversion, removal of low-variance (<0.0001) and high-covariance (>0.8) ICD codes, temporal train/validation/test split with patient-level exclusion. 
The literature review filtering pipeline is also documented in Figure 2 with counts at each stage.", 263 "source": "opus" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": true, 270 "justification": "MIMIC-IV v2.2 is publicly available through PhysioNet for credentialed users, enabling independent verification of the prediction model results.", 271 "source": "opus" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "Data collection is described: 'MIMIC-IV v2.2, a publicly available deidentified electronic healthcare record database of patients admitted to an ICU or emergency department at Beth Israel Deaconess Medical Center between 2008 and 2019.' The literature review search strategy is also described with specific queries and date.", 277 "source": "opus" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants were recruited. The prediction study uses a standard benchmark database (MIMIC-IV), and the literature review screens published papers.", 283 "source": "opus" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "The prediction pipeline documents inclusion criteria (<1% excluded), preprocessing steps (ICD-10→ICD-9, variance/covariance filtering), and split methodology. The literature review pipeline is documented in Figure 2 with counts at each stage (140→128→122→100→92→37).", 289 "source": "opus" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": false, 295 "answer": false, 296 "justification": "The paper trains traditional ML models (logistic regression, random forest, XGBoost) from scratch on MIMIC-IV data. 
No pre-trained model's capability is being evaluated on a benchmark.", 297 "source": "opus" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": false, 301 "answer": false, 302 "justification": "No pre-trained model is evaluated on a benchmark. The models are trained from scratch with a temporal split, making pre-training contamination inapplicable.", 303 "source": "opus" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": false, 307 "answer": false, 308 "justification": "No pre-trained model is evaluated on a benchmark. The paper trains standard ML classifiers from scratch.", 309 "source": "opus" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants. The study analyzes an existing database (MIMIC-IV) and reviews published literature.", 317 "source": "opus" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants. MIMIC-IV is a deidentified public dataset that does not require per-study IRB approval.", 323 "source": "opus" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants. Patient demographics are reported for the MIMIC-IV cohort (mean age 58.7, 53% female) as data characteristics, not as study participant demographics.", 329 "source": "opus" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants were recruited for this study.", 335 "source": "opus" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants. This is not an experimental study with human subjects.", 341 "source": "opus" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants. 
No experimental conditions requiring blinding.", 347 "source": "opus" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants.", 353 "source": "opus" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": false, 360 "justification": "No inference cost, latency, or computational time is reported for any of the three trained models.", 361 "source": "opus" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "No computational budget, hardware specification, or training time is reported despite training three ML models on 180,640 patients.", 367 "source": "opus" 368 } 369 }, 370 "survey_methodology": { 371 "prisma_or_structured_protocol": { 372 "applies": true, 373 "answer": true, 374 "justification": "The literature review uses reproducible search queries on Google Scholar (two specific queries stated), a defined sorting criterion (citations per year), a stopping rule (100 papers), and a PRISMA-style flow diagram (Figure 2) with counts at each screening stage.", 375 "source": "opus" 376 }, 377 "quality_assessment_of_sources": { 378 "applies": true, 379 "answer": false, 380 "justification": "The review categorizes papers by whether they use ICD codes as features but does not assess the methodological quality of included studies. No quality scoring rubric or risk-of-bias assessment is applied.", 381 "source": "opus" 382 }, 383 "publication_bias_discussed": { 384 "applies": true, 385 "answer": false, 386 "justification": "No discussion of publication bias. 
The review sorted by citations per year, which introduces selection bias toward highly cited papers, but this is not discussed as a limitation.", 387 "source": "opus" 388 } 389 } 390 } 391 }, 392 "claims": [ 393 { 394 "claim": "40.2% of published MIMIC-based studies predicting same-admission outcomes use ICD diagnostic codes despite documentation that codes are only finalized after discharge", 395 "evidence": "Systematic literature review: screened 100 papers, found 92 predicting same-admission outcomes, 37/92 used ICD codes as input features", 396 "supported": "strong" 397 }, 398 { 399 "claim": "Prediction models trained only on age, sex, and ICD-9 codes achieve 0.97-0.98 AUROC for in-hospital mortality", 400 "evidence": "Logistic regression (AUROC 0.98), random forest (0.97), XGBoost (0.97) trained and evaluated on MIMIC-IV with 180,640 patients, tested on 36,128-patient held-out set", 401 "supported": "strong" 402 }, 403 { 404 "claim": "The most important predictive ICD codes for mortality are post-discharge diagnoses that represent pure label leakage (brain death, cardiac arrest, encounter for palliative care, DNR status)", 405 "evidence": "Feature importance analysis (Figure 1B: LR odds ratios with p<0.05; 1C: RF and XGB importance) shows these codes rank highest despite obvious unavailability at prediction time", 406 "supported": "strong" 407 }, 408 { 409 "claim": "Using ICD codes renders same-admission prediction models clinically useless because codes are finalized after discharge and thus unavailable for real-time decision-making", 410 "evidence": "Logical argument supported by feature importance analysis showing dominant codes (brain death, palliative care) are post-hoc outcomes. 
No empirical demonstration of deployment failure.", 411 "supported": "moderate" 412 }, 413 { 414 "claim": "ICD code label leakage is common in healthcare machine learning research broadly, extending beyond MIMIC to other institutions and datasets", 415 "evidence": "Limited to MIMIC-IV. Generalization stated without empirical evidence from other datasets: 'It is very unlikely that this problem is isolated to MIMIC database work but reflects a broader challenge.'", 416 "supported": "weak" 417 } 418 ], 419 "methodology_tags": [ 420 "observational", 421 "benchmark-eval", 422 "case-study" 423 ], 424 "key_findings": "Forty percent (37 of 92) of the reviewed MIMIC-based studies predicting same-admission outcomes use diagnostic codes (ICD-9/10) that are finalized only after hospital discharge, creating severe label leakage. Models trained on only age, sex, and ICD codes achieve 0.97-0.98 AUROC for in-hospital mortality, with top predictors being obviously post-hoc outcomes (brain death, cardiac arrest, palliative care encounter) rather than early clinical signals. This renders published models incapable of real-time deployment despite impressive benchmark performance, demonstrating a widespread methodological flaw in healthcare ML research.", 425 "red_flags": [ 426 { 427 "flag": "Over-generalization without evidence", 428 "detail": "Findings limited to MIMIC-IV (one institution, 2008-2019) yet conclusions generalize to 'healthcare machine learning research broadly' via speculation rather than empirical evidence from other datasets." 429 }, 430 { 431 "flag": "No counterfactual comparison", 432 "detail": "Paper shows ICD-only models achieve 0.97+ AUROC but never compares models with/without ICD codes on identical data to isolate causal contribution of label leakage." 433 }, 434 { 435 "flag": "Systematic review sampling bias", 436 "detail": "Papers sorted by 'citations per year' which may bias toward well-cited studies with better methodology, potentially underestimating true prevalence of label leakage in the population." 
437 }, 438 { 439 "flag": "Missing statistical rigor on main results", 440 "detail": "AUROC point estimates (0.97-0.98) reported with no confidence intervals, error bars, or variance measures. No power analysis or sample size justification." 441 }, 442 { 443 "flag": "Limited hyperparameter transparency", 444 "detail": "Hyperparameters tuned on validation set but specific values (learning rates, tree depths, regularization) not reported, reducing reproducibility." 445 }, 446 { 447 "flag": "No practical deployment validation", 448 "detail": "Claims models are 'clinically useless' but does not empirically test predictions in real-time clinical workflows or against prospective data." 449 }, 450 { 451 "flag": "Single-institution generalization", 452 "detail": "Data from one hospital (Beth Israel Deaconess) across 11 years; population and temporal scope not explicitly bounded despite geographic/temporal specificity." 453 } 454 ], 455 "cited_papers": [ 456 { 457 "title": "A framework for understanding label leakage in machine learning for health care", 458 "relevance": "Conceptual foundation for label leakage in healthcare; directly cited as prior framework" 459 }, 460 { 461 "title": "Scalable and accurate deep learning for electronic health records", 462 "relevance": "Major published model on MIMIC that may suffer from label leakage; exemplifies the problem" 463 }, 464 { 465 "title": "Machine learning for patient risk stratification: standing on, or looking over, the shoulders of clinicians?", 466 "relevance": "Prior critique of healthcare ML research-deployment gap; related methodological concerns" 467 }, 468 { 469 "title": "Variable generalization performance of a deep learning model to detect pneumonia in chest radiographs", 470 "relevance": "Documents shortcut learning and generalization failure in healthcare ML; parallel methodological issue" 471 }, 472 { 473 "title": "MIMIC-III, a freely accessible critical care database", 474 "relevance": "Original MIMIC 
documentation; explicitly warns against same-admission ICD code use" 475 }, 476 { 477 "title": "MIMIC-IV, a freely accessible electronic health record dataset", 478 "relevance": "Current MIMIC documentation clarifying post-discharge timing of ICD codes; primary dataset" 479 } 480 ], 481 "engagement_factors": { 482 "practical_relevance": { 483 "score": 2, 484 "justification": "Directly actionable for anyone building clinical prediction models on EHR data: do not use same-admission ICD codes as features." 485 }, 486 "surprise_contrarian": { 487 "score": 2, 488 "justification": "The finding that 40% of published MIMIC studies contain this basic methodological flaw challenges confidence in healthcare AI literature quality." 489 }, 490 "fear_safety": { 491 "score": 2, 492 "justification": "Raises patient safety concerns: clinically deployed models built on leaked features would produce useless predictions for real-time clinical decision-making." 493 }, 494 "drama_conflict": { 495 "score": 2, 496 "justification": "Effectively calls out a large fraction of the healthcare AI literature as methodologically flawed, with AUROC inflation rendering models clinically useless." 497 }, 498 "demo_ability": { 499 "score": 1, 500 "justification": "Code is on GitHub for replication, but the paper is a methodology critique rather than a usable tool." 501 }, 502 "brand_recognition": { 503 "score": 1, 504 "justification": "University of Chicago affiliation is respected but not a household name; MIMIC is well-known within healthcare AI but not to general audiences." 505 } 506 }, 507 "hn_data": { 508 "threads": [], 509 "top_points": 0, 510 "total_points": 0, 511 "total_comments": 0 512 } 513 }