scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25622B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Diagnostic Codes in AI prediction models and Label Leakage of Same-admission Clinical Outcomes",
      6     "authors": [
      7       "Bashar Ramadan",
      8       "Ming-Chieh Liu",
      9       "Michael C. Burkhart",
     10       "William F Parker",
     11       "Brett K. Beaulieu-Jones"
     12     ],
     13     "year": 2025,
     14     "venue": "medRxiv",
     15     "arxiv_id": null,
     16     "doi": "10.1101/2025.08.09.25333360"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All abstract claims are supported: ICD codes finalized after discharge (MIMIC documentation), 40.2% prevalence (37/92 in systematic review), AUROC 0.97-0.98 (Table 1A), top codes clinically unavailable (identified 'brain death', 'palliative care').",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Main claim 'ICD codes inflate performance' supported by showing high AUROC with ICD-only models, then demonstrating top predictive codes are clinically unavailable at prediction time (label leakage mechanism).",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Analysis explicitly limited to MIMIC-III/IV data. Speculation about private datasets and broader problem acknowledged as beyond their evidence: 'unlikely that this problem is isolated to MIMIC...reflects a broader challenge.'",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Discusses alternative interpretations: some codes known early (broken limbs, burns), codes from prior admissions, clinician documentation focus signaling stability (external hemorrhoids anomaly), and possibility of timestamped codes being acceptable.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Clearly distinguishes between what's measured (AUROC on test set) and what's claimed (clinical usability). Explicitly states high AUROC 'renders the model incapable of making clinically useful predictions in real-time.'",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Dedicated limitations section in Discussion: 'Both analyses in our study are limited because they only the benchmark MIMIC dataset.' Also discusses lack of timestamps in MIMIC and uncertainty about private datasets.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats: (1) MIMIC-only analysis; (2) no audit log or timestamps available; (3) systematic review limited to top-cited papers, potentially introducing citation bias; (4) can't estimate frequency on private institutional datasets.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Scope explicitly bounded: 'limited because they only the benchmark MIMIC dataset', cannot make claims about private institutional data, analysis covers MIMIC-III and MIMIC-IV for same-admission outcomes.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding source disclosed anywhere in the manuscript. Preprint format may be incomplete, but as presented, no funding statement appears.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors from University of Chicago (Center for Computational Medicine and Clinical AI, Department of Medicine). No apparent affiliation with MIMIC creators or evaluated products.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "No funding disclosed, so funder independence cannot be verified from the manuscript; answered on the assumption that the work is unfunded or independently funded. Authors have no apparent financial stake in MIMIC or prediction model companies.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement provided. Standard 'Competing Interests' section absent from the manuscript.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms clearly defined: 'label leakage' explained with appendicitis example, 'ICD codes' described as post-discharge finalized, 'data leakage' defined, 'same-admission outcomes' clear from context.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Dual contribution explicit: (1) quantify prevalence of ICD-code label leakage in published MIMIC models (40.2%), (2) demonstrate the impact via high-accuracy mortality models using only ICD codes.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Engages with Davis et al. (2023) framework for label leakage, cites shortcut learning literature (Zech, Banerjee, Nauta), positions work as extending prior understanding to quantify breadth of a known problem.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "States 'Full source code is available on Github (https://github.com/bbj-lab/data-leakage)' — code release stated with a repository link, though environment and dependencies not fully specified in paper.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Uses MIMIC-IV v2.2, described as 'publicly available deidentified electronic healthcare record database.' Standard public benchmark used unmodified.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "References 'scikit-learn', 'random forest', 'XGBoost' but provides no requirements.txt, Dockerfile, or Python version specification. Dependencies listed only via citations.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Methods describe preprocessing and model training clearly, GitHub link provided, but no step-by-step instructions in paper. Insufficient detail for reproduction without accessing repo.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "AUROC results reported as ranges ('0.97-0.98') without confidence intervals. No error bars on figures. Balanced accuracy reported without spreads.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "Logistic regression: p-values reported with Benjamini–Hochberg FDR correction (p<0.05). Train-test split methodology sound. Random forest/XGBoost lack significance testing but different paradigm.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "AUROC and balanced accuracy are effect sizes. Odds ratios reported for logistic regression features. Adequate for classification task.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Large sample (422,534 admissions, 180,640 patients) but no explicit justification, power analysis, or argument for sufficiency. Size appears adequate but not justified a priori.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Only age reported with SD (58.69±19.23). Single train-val-test split with no cross-validation. No variance across multiple runs or folds reported.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "Compares logistic regression vs random forest vs XGBoost (model types, not baselines). No comparison to published MIMIC mortality models or clinically established baselines.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": false,
    187           "answer": false,
    188           "justification": "N/A — no baseline models included for comparison.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "Models trained on age + sex + ICD codes. Missing critical ablation: what happens with age + sex only? Cannot quantify ICD codes' actual contribution.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "AUROC and balanced accuracy reported. Systematic review uses counts and percentages. Two metrics for main task.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "N/A — retrospective EHR analysis, no human evaluation of model outputs.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Explicit train-validation-test split: 70%-10%-20%. Results reported on held-out test set per TRIPOD+AI guidelines.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": false,
    218           "justification": "Binary classification task (mortality yes/no). No stratified analysis by subgroup, diagnosis category, or patient demographics.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Anomaly discussed: 'external hemorrhoids without complications' important to random forest, explanation offered (signals clinician focus on stability). Limited but present.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": false,
    230           "justification": "No negative results reported; all models achieve high AUROCs. The point is high metrics are misleading, but explicit negative findings absent.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Models cited (scikit-learn, XGBoost by reference number) but specific versions not provided. Snapshot dates or version numbers absent.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": false,
    243           "answer": false,
    244           "justification": "N/A — not a language model or prompt-based study.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "States 'tuning hyperparameters in validation set' but specific values (regularization, tree depth, learning rate, etc.) not reported.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "N/A — no agentic scaffolding or complex system structure.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Preprocessing documented: ICD-10 converted to ICD-9, codes with variance <0.0001 or covariance >0.8 removed, age and sex retained as features.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "MIMIC-IV is publicly available. Processed dataset not explicitly released but underlying data accessible to verify findings.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "MIMIC-IV collection described: 'deidentified electronic healthcare record database...Beth Israel Deaconess Medical Center between 2008 and 2019', ICU and ED admissions.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": true,
    282           "justification": "Inclusion: 'All admissions with ICD codes were included in our study, with less than 1% excluded.' Exclusion criterion for <1% not detailed but overall approach stated.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Pipeline documented: MIMIC data → preprocessing (ICD conversion, variance filtering) → train-val-test split (70%-10%-20%), excluding patient overlap between sets.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "N/A — not evaluating pre-trained models on benchmarks, training from scratch on MIMIC.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "Explicitly addressed: 'excluding patients from the validation and test sets who also had admissions in the training set.' Good practice for preventing patient-level leakage between splits.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "Entire paper addresses contamination via label leakage (ICD codes finalized post-discharge). Mechanism and impact thoroughly discussed.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "N/A — no human subjects, retrospective EHR analysis.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "N/A — MIMIC is deidentified, no IRB approval mentioned or needed (preprint may omit).",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "N/A — no human subjects enrolled.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "N/A — admissions criteria stated but no human subject inclusion/exclusion.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "N/A — observational retrospective study, no randomization.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "N/A — no human subjects or blinding applicable.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "N/A — no human participant follow-up or dropout.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": false,
    359           "answer": false,
    360           "justification": "N/A — not a system deployment study. Inference cost on standard ML models not relevant focus.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": false,
    365           "answer": false,
    366           "justification": "N/A — computational budget not applicable or relevant for retrospective analysis on standard hardware.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "ICD diagnostic codes are only finalized after hospital discharge and are unavailable during patient admission",
    375       "evidence": "MIMIC-III documentation states codes arise 'from patient discharges'; MIMIC-IV states 'determined by trained professionals after reviewing signed patient notes.' Paper provides example of post-hoc code assignment.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "40.2% of published prediction models using MIMIC for same-admission outcomes include ICD codes as features despite warnings against this practice",
    380       "evidence": "Systematic review of 100 papers: 92 performed same-admission prediction, 37 used ICD codes (37/92 = 40.2%, Figure 2).",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Models trained solely on ICD codes can predict in-hospital mortality with high accuracy (AUROC 0.97-0.98)",
    385       "evidence": "Table 1A: logistic regression AUROC 0.98, random forest 0.97, XGBoost 0.97 on held-out test set with age + sex + ICD-9 features only.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "The most predictive ICD codes for mortality prediction are clinically unavailable at the time a prediction must be made",
    390       "evidence": "Figure 1B-C identify top codes: 'brain death,' 'cardiac arrest,' 'Do Not Resuscitate status,' 'Encounter for palliative care' — all post-discharge or end-of-life indicators finalized after admission outcome is known.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Using ICD codes for same-admission outcome prediction renders models clinically useless despite high research metrics",
    395       "evidence": "Discussed in conclusion and discussion: high AUROC is artificial inflation from label leakage, models 'could never be deployed in real-world clinical environments' because codes unavailable at prediction time.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "This label leakage problem is likely prevalent beyond MIMIC in other healthcare ML research",
    400       "evidence": "Speculation in discussion: 'It is very unlikely that this problem is isolated to MIMIC database work but reflects a broader challenge in healthcare machine learning research.' Speculative, not empirically demonstrated.",
    401       "supported": "weak"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "observational",
    406     "benchmark-eval",
    407     "systematic-review"
    408   ],
    409   "key_findings": "In a systematic review, 40.2% (37 of 92) of published prediction models using MIMIC for same-admission outcomes incorrectly use ICD diagnostic codes as features, despite explicit dataset documentation stating these codes are finalized only after hospital discharge. Models trained on these post-discharge codes (plus age and sex) achieve very high discrimination (AUROC 0.97–0.98) for in-hospital mortality, with the most predictive codes (e.g., \"brain death,\" \"palliative care\") being inherently unavailable at decision time. This represents severe label leakage that renders ostensibly accurate models clinically useless, suggesting a widespread methodological failure in healthcare AI research.",
    410   "red_flags": [
    411     {
    412       "flag": "No ablation study",
    413       "detail": "Missing critical ablation comparing age+sex+ICD codes vs. age+sex alone. Cannot quantify ICD contribution or validate that leakage drives the high AUROC."
    414     },
    415     {
    416       "flag": "Single train-test split, no cross-validation",
    417       "detail": "Only 70-10-20 split reported. No k-fold or cross-validation means variance not estimated; reported AUROCs (0.97–0.98) lack confidence intervals."
    418     },
    419     {
    420       "flag": "Confidence intervals not reported",
    421       "detail": "AUROC ranges reported ('0.97–0.98') appear to be min-max across three model types, not actual CIs. Precision unjustified given single split."
    422     },
    423     {
    424       "flag": "Systematic review citation bias",
    425       "detail": "Sorted results by citations-per-year and stopped at n=100. Risk of selection bias toward high-visibility papers; inter-rater reliability not assessed."
    426     },
    427     {
    428       "flag": "Hyperparameters not specified",
    429       "detail": "Tuning performed in validation set but specific values (regularization, tree depth, learning rate) not reported, limiting reproducibility."
    430     },
    431     {
    432       "flag": "No comparison to published baselines",
    433       "detail": "Cannot assess whether their leakage finding explains the inflated performance of existing MIMIC models or if other factors also contribute."
    434     },
    435     {
    436       "flag": "Funding and competing interests not disclosed",
    437       "detail": "No funding source stated; no competing interests section. May be preprint limitation but transparency concern."
    438     },
    439     {
    440       "flag": "Speculation on private data without evidence",
    441       "detail": "Assumes label leakage is worse on private institutional datasets but provides no data; reasonable inference but presented as established fact."
    442     }
    443   ],
    444   "cited_papers": [
    445     {
    446       "title": "A framework for understanding label leakage in machine learning for health care",
    447       "authors": "Davis et al.",
    448       "year": 2023,
    449       "venue": "Journal of the American Medical Informatics Association",
    450       "relevance": "Foundational framework for label leakage; this paper applies and quantifies the problem in a specific high-impact context."
    451     },
    452     {
    453       "title": "Variable generalization performance of a deep learning model to detect pneumonia in chest radiographs",
    454       "authors": "Zech et al.",
    455       "year": 2018,
    456       "venue": "PLOS Medicine",
    457       "relevance": "Example of shortcut learning and domain shift in medical AI; similar methodological concern to label leakage."
    458     },
    459     {
    460       "title": "Shortcuts causing bias in radiology artificial intelligence",
    461       "authors": "Banerjee et al.",
    462       "year": 2023,
    463       "venue": "Journal of the American College of Radiology",
    464       "relevance": "Systematic review of shortcut learning in medical imaging; parallel problem to diagnostic code leakage."
    465     },
    466     {
    467       "title": "Scalable and accurate deep learning for electronic health records",
    468       "authors": "Rajkomar et al.",
    469       "year": 2018,
    470       "venue": "arXiv",
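    471       "relevance": "Influential deep learning model for EHR-based prediction; high-profile example of the kind of work in which diagnosis-code availability at prediction time matters. Note: its data came from two academic medical centers rather than MIMIC."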
    472     },
    473     {
    474       "title": "Machine learning for patient risk stratification: standing on, or looking over, the shoulders of clinicians?",
    475       "authors": "Beaulieu-Jones et al.",
    476       "year": 2021,
    477       "venue": "npj Digital Medicine",
    478       "relevance": "Critical perspective on ML clinical utility; first-authored by Beaulieu-Jones, the last-listed author of the current paper; earlier work on deployment validity."
    479     }
    480   ],
    481   "engagement_factors": {
    482     "practical_relevance": {
    483       "score": 2,
    484       "justification": "Identifies a real problem affecting 40% of the reviewed MIMIC same-admission prediction models, highly relevant to practitioners and model developers. Limited scope to MIMIC reduces breadth but impact on healthcare ML practice is significant."
    485     },
    486     "surprise_contrarian": {
    487       "score": 3,
    488       "justification": "Directly challenges validity of ~40% of published models in a major benchmark dataset. Strongly contrarian finding that established practice is methodologically invalid despite high reported metrics."
    489     },
    490     "fear_safety": {
    491       "score": 2,
    492       "justification": "Raises clinical safety concerns: AI models appearing rigorous but actually clinically useless due to label leakage. Healthcare deployment failure scenario, but limited to same-admission prediction task."
    493     },
    494     "drama_conflict": {
    495       "score": 3,
    496       "justification": "Strong conflict angle: 40% of the reviewed MIMIC same-admission prediction studies ignore explicit dataset warnings, leading to invalid models. Documented guidance being ignored creates clear drama and calls out community practice."
    497     },
    498     "demo_ability": {
    499       "score": 1,
    500       "justification": "Requires MIMIC database access (restricted registration), so not easily reproducible by general audience. Code is stated to be available on GitHub, but results are not independently verifiable without MIMIC credentials."
    501     },
    502     "brand_recognition": {
    503       "score": 1,
    504       "justification": "University of Chicago is respected but not in top tier of AI research reputation. Authors not household names in AI/ML community (lead author working in healthcare domain, not main labs)."
    505     }
    506   },
    507   "hn_data": {
    508     "threads": [],
    509     "top_points": 0,
    510     "total_points": 0,
    511     "total_comments": 0
    512   }
    513 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs