scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25126B)
      1 {
      2   "paper": {
      3     "title": "Diagnostic Codes in AI prediction models and Label Leakage of Same-admission Clinical Outcomes",
      4     "authors": [
      5       "Bashar Ramadan",
      6       "Ming-Chieh Liu",
      7       "Michael C. Burkhart",
      8       "William F. Parker",
      9       "Brett K. Beaulieu-Jones"
     10     ],
     11     "year": 2025,
     12     "venue": "medRxiv",
     13     "doi": "10.1101/2025.08.09.25333360"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["survey_methodology"],
     17   "methodology_tags": ["observational", "meta-analysis"],
     18   "key_findings": "ICD diagnostic codes, finalized only after hospital discharge, can predict in-hospital mortality with AUROC 0.97-0.98 when used, together with only age and sex, as the features in ML models trained on MIMIC-IV, with top predictors being 'brain death,' 'cardiac arrest,' and 'encounter for palliative care.' A systematic literature review found that 40.2% (37/92) of published MIMIC-based prediction studies use same-admission ICD codes as input features despite both MIMIC publications explicitly warning against this practice. This widespread label leakage renders the resulting models clinically useless for real-time prediction.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper states 'Full source code is available on Github (https://github.com/bbj-lab/data-leakage).' A working URL is provided."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The study uses MIMIC-IV v2.2, 'a publicly available deidentified electronic healthcare record database.' This is a standard public dataset accessible through PhysioNet."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper mentions scikit-learn and XGBoost libraries but provides no requirements.txt, Dockerfile, library versions, or detailed environment specification sufficient to recreate the computational environment."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "While code is released on GitHub, the paper itself contains no step-by-step reproduction instructions, README description, or 'Reproducing Results' section."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "AUROCs are reported as point estimates (0.97-0.98) with no confidence intervals, error bars, or uncertainty quantification."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper applies the Benjamini-Hochberg procedure to control for false discovery rate with a threshold of p < 0.05 for ICD codes in the logistic regression model."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Odds ratios are reported for ICD codes in the logistic regression model, and AUROCs provide effect size context. The 40.2% prevalence rate in the literature review also provides magnitude context."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No power analysis or sample size justification is provided for either the MIMIC-IV cohort (180,640 patients) or the literature review sample (100 papers, with 50 per MIMIC version chosen arbitrarily)."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Results appear to be from single experimental runs. No variance, standard deviation, or spread measures across multiple runs are reported for any model."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper compares ICD-only model performance against published models: 'These results are even better than published models trained on the same data that also included many additional predictive features' (refs 1, 2). Three model types (LR, RF, XGBoost) are also compared against each other."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Baselines include Renc et al. (2024) and Rajkomar et al. (2018). The former is contemporary; the latter is a foundational reference in healthcare AI. The comparison purpose is to show ICD-only models match or exceed full-feature models."
     79       },
     80       "ablation_study": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "The system is intentionally minimal by design (ICD codes + age + sex) to demonstrate label leakage, not to optimize a multi-component system. There is essentially one component being tested."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper reports both AUROC and balanced accuracy as evaluation metrics: 'performance assessed using AUROC and balanced accuracy.'"
     89       },
     90       "human_evaluation": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "Human evaluation is irrelevant to demonstrating label leakage in ML models. The claim is about data leakage mechanics, not output quality requiring human judgment."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper uses a temporal train/validation/test split: 'We partitioned the dataset by the date of admission into train (70%), validation (10%), test (20%) sets per TRIPOD-AI+ guidelines,' with explicit exclusion of patients appearing in multiple splits."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Feature importance breakdowns are provided for all three model types: odds ratios and p-values for logistic regression (Figure 1B), Gini importance for random forest, and gain for XGBoost (Figure 1C). Results are reported per model."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper discusses the anomalous appearance of 'external hemorrhoids without complications' as a predictor: 'This anomaly may reflect the model's ability to detect a clinician's focus on documenting less severe conditions, signaling relative patient stability.'"
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "All three models show uniformly high AUROC (0.97-0.98). No configurations that failed, approaches that were tried and abandoned, or experiments showing lower performance are reported."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claims of AUROC 0.97-0.98, the specific ICD codes as top predictors ('brain death,' 'cardiac arrest,' 'encounter for palliative care,' 'do not resuscitate'), and 40.2% prevalence in the literature are all directly supported by results in the paper."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The causal claim that ICD codes 'inflate' performance via label leakage is supported by a transparent mechanism: codes like 'brain death' and 'cardiac arrest' are only assigned after the clinical event occurs, making their use to predict that event circular. The feature importance analysis directly demonstrates this mechanism."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title 'Diagnostic Codes in AI prediction models' frames the problem generally, and the Discussion states 'It is very unlikely that this problem is isolated to MIMIC database work but reflects a broader challenge.' However, both analyses are limited to MIMIC and Google Scholar-indexed papers. The framing extends beyond what was empirically tested."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper discusses that some diagnoses may be known early in admission (e.g., broken limbs, chronic conditions carried over from prior visits), that problem list codes might be available during a stay, and offers an alternative explanation for the 'external hemorrhoids' anomaly (clinician documentation patterns signaling patient stability)."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper's claims match the granularity of its measurements. It measures AUROC on MIMIC-IV and claims this demonstrates label leakage on MIMIC-IV. The distinction between what ICD codes represent (billing/clinical thinking) versus patient state is explicitly discussed."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper names scikit-learn (ref 11) and XGBoost (ref 12) but provides no library version numbers or specific model configurations beyond the model type names."
    148       },
    149       "prompts_provided": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "The paper uses traditional ML models (logistic regression, random forest, XGBoost), not prompted language models. No prompting is involved."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "The paper mentions 'tuning hyperparameters in validation set' but does not report any actual hyperparameter values, search ranges, or final selected configurations for any of the three models."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. The paper trains standard ML classifiers."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Preprocessing steps are documented: ICD-10 to ICD-9 conversion, removal of low-variance (<0.0001) and high-covariance (>0.8) ICD codes, temporal train/validation/test split with patient-level exclusion. The literature review filtering pipeline is also documented in Figure 2 with counts at each stage."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. Limitations are discussed within the Discussion section but are not set apart as a distinct subsection."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Specific threats are discussed: the study is limited to MIMIC only; other datasets with timestamped ICD codes may not have this problem; ICD codes represent billing/clinical thinking rather than patient state; the literature review used only Google Scholar and top-cited papers."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The paper explicitly states: 'Both analyses in our study are limited because they only the benchmark MIMIC dataset' and 'While it is not possible to estimate how frequently this occurs on private, institutional datasets.' It also notes MIMIC lacks timestamps and problem list codes."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "MIMIC-IV v2.2 is publicly available through PhysioNet for credentialed users, enabling independent verification of the prediction model results."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Data collection is described: 'MIMIC-IV v2.2, a publicly available deidentified electronic healthcare record database of patients admitted to an ICU or emergency department at Beth Israel Deaconess Medical Center between 2008 and 2019.' The literature review search strategy is also described with specific queries and date."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants were recruited. The prediction study uses a standard benchmark database (MIMIC-IV), and the literature review screens published papers."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The prediction pipeline documents inclusion criteria (<1% excluded), preprocessing steps (ICD-10→ICD-9, variance/covariance filtering), and split methodology. The literature review pipeline is documented in Figure 2 with counts at each stage (140→128→122→100→92→37)."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding source, grants, or sponsorship are mentioned anywhere in the paper."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are clearly listed: Center for Computational Medicine and Clinical Artificial Intelligence, Department of Medicine, University of Chicago; MacLean Center for Clinical Medical Ethics, University of Chicago."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding information is disclosed, making it impossible to assess funder independence."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests statement or financial disclosure is present in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "The paper trains traditional ML models (logistic regression, random forest, XGBoost) from scratch on MIMIC-IV data. No pre-trained model's capability is being evaluated on a benchmark."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No pre-trained model is evaluated on a benchmark. The models are trained from scratch with a temporal split, making pre-training contamination inapplicable."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No pre-trained model is evaluated on a benchmark. The paper trains standard ML classifiers from scratch."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The study analyzes an existing database (MIMIC-IV) and reviews published literature."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants. MIMIC-IV is a deidentified public dataset that does not require per-study IRB approval."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants. Patient demographics are reported for the MIMIC-IV cohort (mean age 58.7, 53% female) as data characteristics, not as study participant demographics."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants were recruited for this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants. This is not an experimental study with human subjects."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants. No experimental conditions requiring blinding."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No inference cost, latency, or computational time is reported for any of the three trained models."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No computational budget, hardware specification, or training time is reported despite training three ML models on 180,640 patients."
    295       }
    296     },
    297     "survey_methodology": {
    298       "prisma_or_structured_protocol": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "The literature review uses reproducible search queries on Google Scholar (two specific queries stated), a defined sorting criterion (citations per year), a stopping rule (100 papers), and a PRISMA-style flow diagram (Figure 2) with counts at each screening stage."
    302       },
    303       "quality_assessment_of_sources": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The review categorizes papers by whether they use ICD codes as features but does not assess the methodological quality of included studies. No quality scoring rubric or risk-of-bias assessment is applied."
    307       },
    308       "publication_bias_discussed": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No discussion of publication bias. The review sorted by citations per year, which introduces selection bias toward highly cited papers, but this is not discussed as a limitation."
    312       }
    313     }
    314   },
    315   "claims": [
    316     {
    317       "claim": "ML models using only ICD codes, age, and sex can predict in-hospital mortality with AUROC 0.97-0.98 on MIMIC-IV",
    318       "evidence": "Three models (logistic regression, random forest, XGBoost) evaluated on held-out test set (20% temporal split) from 180,640 patients yielded AUROCs of 0.98, 0.97, and 0.97 respectively (Figure 1A, eTable 1).",
    319       "supported": "strong"
    320     },
    321     {
    322       "claim": "The most important ICD codes for mortality prediction are diagnoses finalized only after discharge, which makes them label leakage",
    323       "evidence": "Feature importance analysis shows 'brain death,' 'cardiac arrest,' 'encounter for palliative care,' and 'do not resuscitate status' as top predictors across all three models (Figure 1B, 1C). These codes document events that occur during or after the clinical outcome.",
    324       "supported": "strong"
    325     },
    326     {
    327       "claim": "40.2% of published MIMIC-based prediction studies use same-admission ICD codes as input features",
    328       "evidence": "Systematic review of 92 papers building prediction models for same-admission outcomes found 37 (40.2%) used ICD codes as features (Figure 2, eTable 2). Papers were selected from 139 Google Scholar results sorted by citations per year.",
    329       "supported": "moderate"
    330     },
    331     {
    332       "claim": "ICD-code based models outperform published models that include additional clinical features",
    333       "evidence": "The paper states ICD-only models achieve 'even better' results than published models (refs 1, 2) 'trained on the same data that also included many additional predictive features from the rest of the electronic medical record.'",
    334       "supported": "moderate"
    335     }
    336   ],
    337   "red_flags": [
    338     {
    339       "flag": "No uncertainty quantification for main results",
    340       "detail": "AUROC values of 0.97-0.98 are reported as point estimates with no confidence intervals, bootstrap intervals, or cross-validation variance. For a paper about methodological rigor in healthcare AI, this is a notable omission."
    341     },
    342     {
    343       "flag": "Literature review limited to single search engine",
    344       "detail": "The systematic review used only Google Scholar. No PubMed, Scopus, or Web of Science searches were conducted. Papers were sorted by citations per year and screening stopped at 100 papers, introducing selection bias toward highly visible work and potentially missing papers with ICD leakage issues."
    345     },
    346     {
    347       "flag": "No hyperparameters reported despite claiming tuning",
    348       "detail": "The paper states hyperparameters were tuned on the validation set but reports none of the actual values, search ranges, or selection criteria for any of the three models."
    349     }
    350   ],
    351   "cited_papers": [
    352     {
    353       "title": "Scalable and accurate deep learning for electronic health records",
    354       "authors": ["Alvin Rajkomar", "Eyal Oren", "Kai Chen"],
    355       "year": 2018,
    356       "arxiv_id": "1801.07860",
    357       "relevance": "Foundational work on deep learning for EHR-based clinical prediction, directly relevant as a baseline that the ICD-only models surpass."
    358     },
    359     {
    360       "title": "Zero shot health trajectory prediction using transformer",
    361       "authors": ["Pawel Renc", "Yugang Jia", "Anthony E. Samir"],
    362       "year": 2024,
    363       "relevance": "Recent transformer-based clinical prediction model using MIMIC, serving as a contemporary baseline for comparison."
    364     },
    365     {
    366       "title": "Machine learning for patient risk stratification: standing on, or looking over, the shoulders of clinicians?",
    367       "authors": ["Brett K. Beaulieu-Jones", "William Yuan", "Gabriel A. Brat"],
    368       "year": 2021,
    369       "relevance": "Discusses whether ML models replicate clinical judgment vs. provide independent predictive value, directly relevant to the label leakage concern."
    370     },
    371     {
    372       "title": "A framework for understanding label leakage in machine learning for health care",
    373       "authors": ["Sharon E. Davis", "Michael E. Matheny", "Suresh Balu", "Mark P. Sendak"],
    374       "year": 2023,
    375       "doi": "10.1093/jamia/ocad178",
    376       "relevance": "Provides the conceptual framework for label leakage in healthcare ML that this paper extends with empirical quantification."
    377     },
    378     {
    379       "title": "Variable generalization performance of a deep learning model to detect pneumonia in chest radiographs: A cross-sectional study",
    380       "authors": ["John R. Zech", "Marcus A. Badgeley", "Manway Liu"],
    381       "year": 2018,
    382       "relevance": "Demonstrates shortcut learning in medical imaging AI, part of the broader shortcut learning literature this paper contributes to."
    383     },
    384     {
    385       "title": "Shortcuts causing bias in radiology artificial intelligence: Causes, evaluation, and mitigation",
    386       "authors": ["Imon Banerjee", "Kamanasish Bhattacharjee", "John L. Burns"],
    387       "year": 2023,
    388       "relevance": "Reviews shortcut learning and bias in medical AI systems, directly relevant to the data leakage problem this paper identifies."
    389     },
    390     {
    391       "title": "Shortcut learning in medical AI hinders generalization: method for estimating AI model generalization without external data",
    392       "authors": ["Cong Ly Ong", "Balagopal Unnikrishnan", "Tony Tadic"],
    393       "year": 2024,
    394       "relevance": "Addresses shortcut learning as a generalization barrier in medical AI, complementary to this paper's focus on ICD code leakage."
    395     },
    396     {
    397       "title": "TRIPOD+AI statement: updated guidance for reporting clinical prediction models that use regression or machine learning methods",
    398       "authors": ["Gary S. Collins", "Karel Moons", "Paula Dhiman"],
    399       "year": 2024,
    400       "doi": "10.1136/bmj-2023-078378",
    401       "relevance": "Reporting guidelines for AI prediction models that the paper follows for train/validation/test split methodology."
    402     }
    403   ],
    404   "engagement_factors": {
    405     "practical_relevance": {
    406       "score": 2,
    407       "justification": "Directly actionable for anyone building clinical prediction models on EHR data: do not use same-admission ICD codes as features."
    408     },
    409     "surprise_contrarian": {
    410       "score": 2,
    411       "justification": "The finding that 40% of published MIMIC studies contain this basic methodological flaw challenges confidence in healthcare AI literature quality."
    412     },
    413     "fear_safety": {
    414       "score": 2,
    415       "justification": "Raises patient safety concerns: clinically deployed models built on leaked features would produce useless predictions for real-time clinical decision-making."
    416     },
    417     "drama_conflict": {
    418       "score": 2,
    419       "justification": "Effectively calls out a large fraction of the healthcare AI literature as methodologically flawed, with AUROC inflation rendering models clinically useless."
    420     },
    421     "demo_ability": {
    422       "score": 1,
    423       "justification": "Code is on GitHub for replication, but the paper is a methodology critique rather than a usable tool."
    424     },
    425     "brand_recognition": {
    426       "score": 1,
    427       "justification": "University of Chicago affiliation is respected but not a household name; MIMIC is well-known within healthcare AI but not to general audiences."
    428     }
    429   }
    430 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs