scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24755B)
      1 {
      2   "paper": {
      3     "title": "ClinNoteAgents: An LLM Multi-Agent System for Predicting and Interpreting Heart Failure 30-Day Readmission from Clinical Notes",
      4     "authors": ["Rongjia Zhou", "Chengzhuo Li", "Carl Yang", "Jiaying Lu"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2512.07081",
      8     "doi": "10.48550/arXiv.2512.07081"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": false,
     15         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The study uses the MIMIC-III database (ref 29), a standard dataset that is publicly available to researchers who complete its credentialed-access process. The data source is clearly identified and accessible."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided. The paper states LLM agents used Qwen3-14B and Qwen3-8B but does not specify library versions, hardware, or dependency details."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The methodology is described at a high level but lacks the specificity needed for exact reproduction."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Table 5 reports 95% confidence intervals for odds ratios (e.g., Age OR 95% CI: 1.002–1.015; Weight OR 95% CI: 0.643–0.942). However, the classification results in Figure 2 report only point estimates for AUROC without confidence intervals."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Logistic regression p-values are reported in Table 5 and chi-square test p-values in Table 6 for association analyses. Statistical tests are appropriately applied to categorical vs. continuous variables."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Table 5 reports odds ratios with 95% CIs for logistic regression results (e.g., Weight OR=0.778, Age OR=1.008). Chi-square statistics are reported in Table 6. These provide effect size context for the association analyses."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No power analysis or justification for the sample size (2,065 patients, 3,544 notes) is provided. The cohort is defined by the MIMIC-III filtering criteria but there is no discussion of whether this sample size is adequate for the analyses performed."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "Classification results (AUROC values in Figure 2) are reported as single-run point estimates with no standard deviation, variance, or spread across multiple runs. No mention of multiple experimental runs or cross-validation folds with variance."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper compares three classifiers (TF-IDF+LR, ClinicalBERT, LoRA-finetuned Qwen3-8B) across four input conditions (raw notes, overall summary, no-number summary, structural extraction). Raw discharge notes serve as the baseline for summarization comparisons."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "ClinicalBERT (2020) is now 5 years old. No comparison against more recent clinical NLP models or state-of-the-art HF readmission prediction methods is included. The paper does not justify why ClinicalBERT is still an appropriate baseline."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The paper effectively ablates the summarization component by comparing four input representations (raw, overall summary, no-number summary, structural extraction) across three classifiers, showing the contribution of different summarization strategies."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "For classification, only AUROC is reported (Figure 2). For extraction, multiple metrics are used (coverage, conditional accuracy, MAE, MAPE), but the core readmission prediction task uses a single metric."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "No human expert evaluation of the system's outputs (extracted risk factors, summaries, predictions) is included. The extraction is compared against structured EHR fields as surrogate ground truth, and diagnosis extraction uses LLM-as-a-judge, but no clinician review of outputs is performed."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "The paper does not describe any train/test/validation split for the classification experiments. There is no mention of held-out test sets, cross-validation, or how data was partitioned for the readmission prediction evaluation."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Table 2 provides per-variable breakdowns for extraction accuracy across vitals and charted SDOH. Tables 5 and 6 provide per-variable association analyses. Figure 2 breaks down classification results by summarization method and classifier type."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The Discussion section covers cases where extraction failed or performed poorly: Height (68.50%) and Weight (57.90%) had lower accuracy due to heterogeneous reporting formats and unit-conversion issues. The diagnosis extraction gap is also addressed."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper explicitly reports that 'summarization did not improve classification performance as we initially expected' and that most SDOH variables commonly reported as influential were not statistically significant in their cohort."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims 'strong performance in extracting risk factors,' 'identifying key contributing factors,' and 'predicting readmission risk,' which are supported by Tables 2, 5, 6, and Figure 2. The claims are reasonably hedged."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The paper uses language like 'Age and BP are positively associated with HF readmission risk' which is correlational. However, the abstract claims the system 'reduces reliance on structured fields' and 'provides a scalable approach' — implying causal improvement. The association analyses (logistic regression, chi-square) are observational and cross-sectional, not causal."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper tests on MIMIC-III (a single US ICU dataset) but the abstract and framing make broad claims about 'data-limited healthcare systems' and 'developing countries.' The Discussion notes that 'evaluating patients across multiple sites may be needed' but the framing significantly overreaches the single-dataset evidence."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The Discussion offers some interpretation of why certain variables were not significant (under-documentation of SDOH, LLM extraction limitations) but does not systematically consider alternative explanations for the main results, such as whether the summarization performance preservation is due to redundancy in clinical notes rather than quality of summarization."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper specifies 'Qwen3-14B', 'Qwen3-8B', and 'ClinicalBERT' but does not provide specific version identifiers, snapshot dates, or checkpoint hashes. 'Qwen3-14B' names a model size within the Qwen3 family, not a specific checkpoint or revision."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper provides actual prompt text for the risk factor extractor (with output schema example), the overall summary prompt, and the no-number summary prompt. These are concrete prompt texts, not just descriptions."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No temperature, top-p, max tokens, or other LLM inference parameters are reported. For ClinicalBERT and LoRA fine-tuning, no learning rate, batch size, or training hyperparameters are specified. The k-medoids clustering used k=200 but LLM hyperparameters are absent."
    144       },
    145       "scaffolding_described": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The multi-agent pipeline is described with three distinct agents (Extractor, Normalizer, Summarizer) and their workflow is outlined in Figure 1. The two-stage normalization process (normalizer generates categories, labeler assigns them) is described in detail."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The data selection pipeline is described: ICD-9 codes used for HF patient identification, readmission pair construction from sequential admissions, and the labeling logic (yi=1 if interval ≤30 days). Table 1 provides cohort summary statistics."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "A 'Limitations' subsection is present in the Discussion section, discussing three specific limitations: indirect evaluation of extracted risk factors, lack of performance gains from summarization, and single-site evaluation."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The Limitations section discusses specific threats: reliance on structured EHR as surrogate ground truth for extraction evaluation, LLM-as-judge limitations for diagnosis validation, and the need for multi-site evaluation. These are specific to this study."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "While the Limitations section acknowledges multi-site evaluation is needed, the paper does not explicitly state what the results do NOT show. The broad framing about 'developing countries' and 'data-limited healthcare systems' is not bounded by explicit statements about what settings the results do not apply to."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The study uses MIMIC-III, which is publicly available (with credentialed access). The underlying patient data can be independently verified by other researchers with MIMIC-III access."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Data collection is described: MIMIC-III database, HF patients identified via specific ICD-9 codes (listed explicitly), readmission pairs constructed by linking sequential admissions, discharge summaries used as input notes."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants were recruited. The study uses a standard public clinical database (MIMIC-III) as its data source."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The pipeline from MIMIC-III to final analysis is documented: ICD-9 filtering → readmission pair construction → discharge note extraction → LLM processing (extraction, normalization, summarization) → statistical analysis/classification. Table 1 provides cohort statistics."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No funding source or acknowledgments section is present in the paper. There is no mention of grants, sponsors, or funding agencies."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "All authors are listed as affiliated with Emory University. The affiliations are clearly stated at the top of the paper."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of funding disclosure is itself a transparency gap."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is included in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "The paper uses Qwen3-14B and Qwen3-8B (LoRA fine-tuned) and ClinicalBERT for readmission prediction on MIMIC-III data. No training data cutoff dates are stated for any of these models. MIMIC-III data could be in their training corpora."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No discussion of whether MIMIC-III data (a widely-used public dataset) may have appeared in the training data of Qwen3 or ClinicalBERT. This is a significant concern given MIMIC-III's ubiquity in ML research."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "MIMIC-III has been publicly available since 2016 and is widely used in ML research. The paper does not address the possibility that Qwen3 models may have been exposed to MIMIC-III-derived content during pre-training, which could inflate extraction and classification performance."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants. The study performs secondary analysis of de-identified clinical records from MIMIC-III."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants. MIMIC-III access requires credentialing but not IRB approval for secondary analysis of de-identified data. The paper discusses ethical considerations regarding data handling."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants in the study design. Patient demographics (age, gender) are reported as cohort characteristics in Table 1, but these are properties of the dataset, not study participants."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants were recruited. Cohort selection criteria (ICD-9 codes) are described for the clinical dataset but this is data filtering, not participant recruitment."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants and no experimental conditions requiring randomization."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants and no experimental conditions requiring blinding."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in the study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "No inference costs, token counts, API costs, or wall-clock time are reported despite the system making multiple LLM calls per note (extraction, normalization, labeling, summarization) across 3,544 notes. The paper claims the approach is 'scalable' without quantifying cost."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No computational budget, GPU hours, hardware specifications, or total compute requirements are reported. The paper mentions models were 'deployed locally within secure computing environments' but provides no details."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "ClinNoteAgents achieves high extraction fidelity for clinical variables, with conditional accuracy above 84% for most vital signs.",
    287       "evidence": "Table 2 shows conditional accuracy ranging from 57.90% (Weight) to 94.16% (SpO2), with 6 of 8 vitals above 84%. MAE and MAPE metrics are also reported.",
    288       "supported": "moderate"
    289     },
    290     {
    291       "claim": "LLM-generated summaries preserve most predictive signal despite 60-90% text reduction.",
    292       "evidence": "Figure 2 shows AUROC drops from 0.6535 (raw) to 0.6434 (no-number summary, 61% reduction) for LR, and to 0.5866 (overall summary, 83% reduction). Performance is retained but some drops are non-trivial.",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "Age, weight, and blood pressure are significantly associated with HF 30-day readmission.",
    297       "evidence": "Table 5 reports p=0.008 for Age, p=0.010 for Weight, p<0.001 for BP SYS, p=0.037 for BP DIA, with odds ratios and 95% CIs.",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "Housing is the only statistically significant SDOH variable associated with HF readmission.",
    302       "evidence": "Table 6 reports chi-square=21.13, p=0.012 for Housing, with all other SDOH variables having p>0.05.",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "ClinNoteAgents provides a scalable and interpretable approach to note-based HF readmission risk modeling in data-limited healthcare systems.",
    307       "evidence": "No scalability metrics, cost data, or evaluation in data-limited settings is provided. The claim in the abstract is not supported by evidence in the paper.",
    308       "supported": "weak"
    309     },
    310     {
    311       "claim": "LLM-based diagnosis extraction achieves conditional accuracy of 62.67% with mean similarity score of 3.04.",
    312       "evidence": "Table 3 reports these metrics from LLM-as-a-judge evaluation comparing LLM-extracted diagnoses to ICD-9 codes.",
    313       "supported": "moderate"
    314     }
    315   ],
    316   "methodology_tags": ["benchmark-eval", "case-study"],
    317   "key_findings": "ClinNoteAgents is a multi-agent LLM framework using Qwen3-14B to extract structured risk factors from clinical discharge notes for heart failure 30-day readmission prediction. On MIMIC-III (2,065 patients), the extraction agents achieve 84-94% conditional accuracy for most vital signs against structured EHR ground truth. Logistic regression identifies age, weight, and blood pressure as significant predictors, while housing is the only significant SDOH factor. LLM summarization preserves most predictive signal despite 60-90% text reduction, though it does not improve over raw notes.",
    318   "red_flags": [
    319     {
    320       "flag": "No train/test split described for classification",
    321       "detail": "The readmission prediction experiments (Figure 2) report AUROC values without describing how the data was split into training and test sets, whether cross-validation was used, or how model selection was performed. This makes the classification results unverifiable."
    322     },
    323     {
    324       "flag": "Overclaimed generalizability",
    325       "detail": "The paper tests only on MIMIC-III (a US ICU dataset) but frames the contribution around 'data-limited healthcare systems' and 'developing countries in Asia and Africa.' No evidence from non-US or resource-limited settings is presented."
    326     },
    327     {
    328       "flag": "No uncertainty quantification for classification",
    329       "detail": "AUROC values are reported as single numbers without confidence intervals, standard deviations, or multiple-run results. With a 35% readmission rate and moderate AUROC values (0.55-0.65), the differences between methods may not be statistically significant."
    330     },
    331     {
    332       "flag": "MIMIC-III contamination risk",
    333       "detail": "MIMIC-III has been publicly available since 2016 and is one of the most widely-used clinical datasets in ML. Qwen3 models may have been exposed to MIMIC-III-derived content during pre-training, which could inflate extraction performance. This is not discussed."
    334     },
    335     {
    336       "flag": "LLM-as-a-judge without validation",
    337       "detail": "Diagnosis extraction is evaluated using an LLM-as-a-judge approach (ref 32), but the judge itself is not validated against human expert ratings. This creates a circular dependency where LLMs evaluate LLM outputs."
    338     },
    339     {
    340       "flag": "No inference cost despite scalability claims",
    341       "detail": "The paper claims the approach is 'scalable' but reports no inference costs, token counts, or processing time despite running multiple LLM agents per note across 3,544 notes."
    342     }
    343   ],
    344   "cited_papers": [
    345     {
    346       "title": "Large language models to identify social determinants of health in electronic health records",
    347       "authors": ["M. Guevara", "S. Chen", "S. Thomas"],
    348       "year": 2024,
    349       "relevance": "Directly relevant as prior work on using LLMs for structured information extraction from clinical notes."
    350     },
    351     {
    352       "title": "ClinicalBERT: Modeling Clinical Notes and Predicting Hospital Readmission",
    353       "authors": ["K. Huang", "J. Altosaar", "R. Ranganath"],
    354       "year": 2020,
    355       "arxiv_id": "1904.05342",
    356       "relevance": "Key baseline model used in the evaluation, represents transformer-based clinical NLP approach."
    357     },
    358     {
    359       "title": "DistillNote: LLM-Based Clinical Note Summaries Improve Heart Failure Diagnosis",
    360       "authors": ["H. O. Boll", "A. O. Boll", "L. P. Boll"],
    361       "year": 2025,
    362       "arxiv_id": "2506.16777",
    363       "relevance": "Directly related work on LLM summarization of clinical notes for downstream clinical tasks."
    364     },
    365     {
    366       "title": "SDoH-GPT: Using Large Language Models to Extract Social Determinants of Health",
    367       "authors": ["B. Consoli", "H. Wang", "X. Wu"],
    368       "year": 2025,
    369       "relevance": "Prior work on using LLMs specifically for SDOH extraction from clinical text."
    370     },
    371     {
    372       "title": "Qwen3 technical report",
    373       "authors": ["Qwen Team"],
    374       "year": 2025,
    375       "arxiv_id": "2505.09388",
    376       "relevance": "Technical report for the primary LLM used in the ClinNoteAgents system."
    377     },
    378     {
    379       "title": "LLM Judge: Building Automatic Evaluators with Large Language Models",
    380       "authors": ["Hugging Face"],
    381       "year": 2024,
    382       "relevance": "Framework used for LLM-as-a-judge evaluation of diagnosis extraction quality."
    383     },
    384     {
    385       "title": "Hallucinations and Key Information Extraction in Medical Texts: A Comprehensive Assessment of Open-Source Large Language Models",
    386       "authors": ["A. B. Das", "S. Ahmed", "S. K. Sakib"],
    387       "year": 2025,
    388       "relevance": "Relevant to LLM reliability concerns in clinical information extraction, cited for hallucination evidence."
    389     },
    390     {
    391       "title": "Mining Social Determinants of Health for Heart Failure Patient 30-Day Readmission via Large Language Model",
    392       "authors": ["M. Shao", "Y. Kang", "X. Hu"],
    393       "year": 2025,
    394       "relevance": "Direct predecessor work from same group on LLM-based SDOH mining for HF readmission."
    395     },
    396     {
    397       "title": "MIMIC-III, a freely accessible critical care database",
    398       "authors": ["A. E. Johnson", "T. J. Pollard", "L. Shen"],
    399       "year": 2016,
    400       "relevance": "The primary dataset used in this study; widely used benchmark for clinical NLP research."
    401     }
    402   ]
    403 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs