scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (30144B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Towards a holistic framework for multimodal LLM in 3D brain CT radiology report generation",
      6     "authors": [
      7       "Cheng-Yi Li",
      8       "Kao-Jung Chang",
      9       "Cheng-Fu Yang",
     10       "Hsin-Yu Wu",
     11       "Wenting Chen"
     12     ],
     13     "year": 2024,
     14     "venue": "Nature Communications",
     15     "arxiv_id": "2407.02235",
     16     "doi": "10.1038/s41467-025-57426-0"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All quantitative claims in the abstract (BLEU-1=44.35, FORTE F1=0.71, CQ500 midline shift accuracy=0.91, 74% Turing test indistinguishability) are backed by corresponding results sections and extended data tables.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims that CVIT improves over RVIT are supported by controlled ablation with four fine-tuning variants compared via Mann-Whitney U tests; negation removal and sentence pairing improvements are quantified against explicit counterfactuals.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The limitations section explicitly bounds scope: model trained on degeneration-oriented Alzheimer's data fails on malignancy and acute trauma in CQ500, and results are not claimed to extend beyond 3D brain CT.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss alternative explanations for why FORTE correlates differently from traditional metrics, or whether Turing test success may reflect surface stylistic mimicry rather than diagnostic accuracy.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly argues that traditional NLP metrics are inadequate proxies for clinical quality and proposes FORTE as a clinically-grounded alternative, acknowledging the surface-text vs. diagnostic-content gap.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "A dedicated limitations paragraph appears in the Discussion section listing three specific limitations: no SOTA MLLM counterpart to benchmark against, degeneration-oriented training data, and no model backbone comparison.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats identified: BrainGPT fails on CQ500 malignancy/acute trauma due to training distribution bias; single-hospital dataset from one patient population (elderly Alzheimer's patients) limits generalizability.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Scope is bounded to 3D brain CT from a single institution with Alzheimer's-oriented data; the paper does not claim results extend to other imaging modalities or disease populations.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Funding fully disclosed: Taiwan NSTC grants (NSTC 112-2321-B-A49-007, NSTC 111-2320-B-A49-028-MY3, others) and Taipei Veterans General Hospital grants listed in Acknowledgements.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All 13 authors have institutional affiliations listed at the top of the paper; UCLA, NYMU, and TPEVGH affiliations are transparent.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Funders are government (NSTC) and a public academic hospital; no commercial entity has a financial stake in the outcome, though TPEVGH provided the training data.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Explicit competing interests statement present: 'The authors declare no competing interests.'",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "MLLM, CVIT, RVIT, and FORTE are all explicitly defined; the brain CT report generation task and its clinical context (list-by-list differential diagnosis format) are explained in the introduction.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three explicit numbered contributions stated in the Introduction: (1) 3D-BrainCT dataset, (2) BrainGPT models via CVIT, (3) FORTE evaluation metric.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Paper compares against LLaVA-Med, Med-PaLM M, Med-Gemini-3D, CT2Rep, and CheXpert CE; discusses why each is inadequate for 3D brain CT and how BrainGPT addresses those gaps.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "GitHub repository (https://github.com/charlierabea/FORTE) is provided with code and model weights for the best BrainGPT-keyword model.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "Primary training data (3D-BrainCT from TPEVGH) cannot be released due to IRB regulations; only the CQ500 external validation set is accessible via Qure.ai.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Training hardware (2×NVIDIA A100) and key libraries (SentenceTransformer, MS-COCO toolkit) are mentioned, but no requirements.txt, Dockerfile, or pinned dependency versions are provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Code is on GitHub but the paper provides only high-level training descriptions; step-by-step instructions sufficient to reproduce reported metric values are not included.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Results are reported as point estimates throughout; Mann-Whitney U tests provide p-values but no confidence intervals or error bars are reported for any main metric scores.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "Mann-Whitney U tests are used throughout for all comparative claims between BrainGPT variants and the baseline Otter model, with p-values reported.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Absolute metric values and percentage improvements are reported (e.g., negation removal: BLEU-4 +57.26%, FORTE F1 gain +0.153); Pearson r reported for correlation analyses.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Dataset size (18,885 scans) is described by what was available retrospectively; no power analysis or formal justification for adequacy of test set size is provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "BLEU/METEOR/ROUGE/FORTE scores are reported as point estimates with no standard deviations; score distributions are shown in figures but not tabulated with spread measures.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "The untuned baseline Otter model is included as comparison throughout all evaluations, scoring BLEU-4=0 and CIDEr-R=5.9.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Otter (2023) is the contemporary foundation model being fine-tuned; comparisons to Med-PaLM M and LLaVA-Med (both 2023–2024) are contemporary.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Four BrainGPT variants (plain, example, template, keyword) representing increasing clinical instruction sophistication constitute a systematic ablation of instruction design choices.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "BLEU-1, BLEU-4, METEOR, ROUGE-L, CIDEr-R, and FORTE with 4 sub-categories (degree, landmark, feature, impression) are all reported.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Linguistic-embedded Turing test with 11 physicians (2 radiologists, 2 neurologists, 7 others) evaluating 6 report cases and providing qualitative linguistic rationales.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Internal test set (1,938 patients, ~87K slices) and external CQ500 validation set (133 scans) are held out and not used during fine-tuning.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "FORTE provides per-category breakdown (degree, landmark, feature, impression); CQ500 results broken down by mass effect, hemorrhage, and midline shift.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Failure modes explicitly discussed: 'interpretation spree' over-negation, misspelling of anatomical terms ('putmen'), and failure to caption malignancy/acute trauma features absent from training distribution.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Traditional metrics show no significant difference between instruction-tuning conditions (p>0.05); midline shift accuracy was only 0.35–0.38 before preprocessing intervention; BrainGPT-template vs keyword showed no significant difference.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "LLaMA-7B and CLIP ViT-L/14 are named but no specific checkpoint or release version is pinned; the Otter version is referenced only via citation without a version tag or hash.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "All four distinct instruction designs are shown in Extended Data Fig. 4 with example instructions for plain, in-context example, template, and keyword tuning.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "Only training duration (12 hours, 3 epochs) and hardware are mentioned; learning rate, batch size, optimizer, and other training hyperparameters are not reported.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Otter architecture described in detail: CLIP ViT-L/14 (frozen) + LLaMA-7B + trainable perceiver resampler + cross-gated attention layers; image-instruction-answer triplet formatting explained.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Slice sampling (24 slices/scan for Otter compatibility), sentence pairing via SentenceTransformer all-mpnet-base-v2, and negation removal are all described with implementation details.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Primary 3D-BrainCT dataset cannot be released due to IRB; CQ500 external validation data is publicly available but not the training data.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Data collection described: retrospective, 9,689 Alzheimer's patients at TPEVGH, January 2010–December 2022, IRB approval obtained (2023-10-002 BC), informed consent waived for retrospective deidentified data.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": false,
    282           "justification": "Turing test physician participants are described by specialty (2 radiologists, 2 neurologists, 7 other licensed physicians) but recruitment method is not stated.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Full pipeline documented: raw CT scans → slice sampling (24 slices/scan) → image-instruction-answer triplet formatting → fine-tuning; CQ500 subset selection criteria (23–40 slices, non-contrast only) also documented.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "Training data cutoffs for the base LLaMA-7B and Otter models are not stated; contamination from public medical imaging data in base model pre-training is not discussed.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "Internal test data is drawn from the same institution and time period (2010–2022) as training data; potential patient-level overlap between train/test splits is not discussed.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "CQ500 is a public dataset that could appear in LLaMA-7B's pre-training corpus; the paper does not address whether the base model was exposed to CQ500 before fine-tuning.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "No pre-registration is mentioned for either the retrospective patient data study or the Turing test physician evaluation.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": true,
    321           "answer": true,
    322           "justification": "IRB approval explicitly stated for patient data collection (2023-10-002 BC); informed consent waived for retrospective deidentified data.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": true,
    327           "answer": true,
    328           "justification": "Patient demographics: mean age 82.59 (SD 9.3), 56.4% male, Alzheimer's diagnosis. Turing test evaluators: 2 radiologists, 2 neurologists, 7 other licensed physicians.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": true,
    333           "answer": true,
    334           "justification": "CQ500 selection criteria stated (non-contrast only, 23–40 slices); training data defined as all Alzheimer's patients from TPEVGH 2010–2022.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": true,
    339           "answer": false,
    340           "justification": "No randomization described for Turing test case selection; six cases appear purposively selected to represent diverse diagnoses (lacunar infarct, SDH, atrophy, midline shift).",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": true,
    345           "answer": true,
    346           "justification": "Turing test is blinded by design: physicians evaluate report pairs without knowing which is BrainGPT vs. human; image context introduced only in the second assessment phase.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "All 11 physicians completed all 6 cases (66 total evaluations reported); no attrition occurred.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "Inference cost or per-report latency is not reported; only training time (12 hours) is mentioned.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Training compute stated: 12 hours on two NVIDIA A100 GPUs for 3 epochs; favorably compared to CT2Rep requiring 7 days on a single A100.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "All four BrainGPT models outperform baseline Otter on traditional NLP metrics for brain CT reports",
    375       "evidence": "Mann-Whitney U test p<0.01; baseline Otter scored BLEU-4=0 and CIDEr-R=5.9 vs BrainGPT-keyword CIDEr-R=153.3 after sentence pairing",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "CVIT models (template, keyword) outperform RVIT models (plain, example) in clinical captioning quality",
    380       "evidence": "FORTE F1 comparisons show p<0.001 (Mann-Whitney U); CIDEr-R shows ascending trend across plain→example→template→keyword (125.86→132.38→147.92→153.3)",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Traditional NLP metrics (BLEU, METEOR, ROUGE) fail to differentiate clinical quality in brain CT reports",
    385       "evidence": "Traditional metrics show no significant difference across instruction-tuning conditions (p>0.05); high intra-correlations (r>0.7) but low correlation with FORTE (r<0.5)",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "74% of BrainGPT-generated captions were indistinguishable from human-written reports in a Turing test",
    390       "evidence": "11 physicians evaluated 6 purposively selected cases (66 total evaluations); 74.24% of BrainGPT reports misidentified as human-written; drops to 56% when CT images are provided",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Negation removal significantly improves both traditional metrics and FORTE scores",
    395       "evidence": "BLEU-4 improved 57.26%, METEOR 24.97%, ROUGE-L 29.04%; overall FORTE average F1 gain 0.153; BrainGPT-keyword F1 from 0.576 to 0.71",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "BrainGPT achieves 0.91 accuracy for midline shift detection on external CQ500 dataset after negation removal",
    400       "evidence": "Zero-shot evaluation on CQ500 n=133 non-contrast scans; baseline without negation removal was only 0.35–0.38 for midline shift",
    401       "supported": "moderate"
    402     },
    403     {
    404       "claim": "FORTE captures clinically distinct information not measured by traditional NLP metrics",
    405       "evidence": "Pearson correlation analysis shows FORTE domains correlate weakly with traditional metrics (r<0.5) and with one another (r<0.5), whereas traditional metrics correlate strongly among themselves (r>0.7)",
    406       "supported": "moderate"
    407     }
    408   ],
    409   "methodology_tags": [
    410     "benchmark-eval",
    411     "case-study",
    412     "observational"
    413   ],
    414   "key_findings": "BrainGPT, fine-tuned from Otter using Clinical Visual Instruction Tuning (CVIT) on 18,885 3D brain CT scan-report pairs, achieves BLEU-1=44.35 and FORTE F1=0.71 on internal testing, with 74% of its reports indistinguishable from radiologist-written reports in a Turing test (dropping to 56% when CT images are provided). Traditional NLP metrics (BLEU/METEOR/ROUGE) fail to distinguish between instruction-tuning conditions (p>0.05), motivating the proposed FORTE metric which captures clinical keyword density across degree, landmark, feature, and impression dimensions. Preprocessing interventions (sentence pairing, negation removal) are critical: negation removal alone improves BLEU-4 by 57% and FORTE F1 by 0.15. The model fails on pathologies underrepresented in training (malignancy, acute trauma), and the single-hospital Alzheimer's-oriented training corpus substantially limits generalizability.",
    415   "red_flags": [
    416     {
    417       "flag": "Turing test sample tiny and non-random",
    418       "detail": "Only 11 physicians evaluated 6 purposively selected cases (66 total judgments); cases were handpicked to represent diverse diagnoses, not randomly sampled, enabling cherry-picking of cases where BrainGPT performs best."
    419     },
    420     {
    421       "flag": "Turing test dominated by non-radiologist evaluators",
    422       "detail": "Only 2 of 11 evaluators are radiologists (the actual domain experts); 2 are neurologists and the remaining 7 are other licensed physicians, which likely inflates the 74% indistinguishability figure."
    423     },
    424     {
    425       "flag": "No variance for main metric results",
    426       "detail": "All BLEU/METEOR/ROUGE/FORTE scores reported as point estimates without standard deviations or confidence intervals; result stability across the test set is unverifiable."
    427     },
    428     {
    429       "flag": "Train/test contamination not addressed",
    430       "detail": "Internal test set drawn from the same institution and time period (2010–2022) as training data; patient-level overlap between splits is not reported or discussed."
    431     },
    432     {
    433       "flag": "Base model training cutoff unknown",
    434       "detail": "LLaMA-7B and Otter pre-training data cutoffs not stated; whether CQ500 (public dataset used for external validation) appeared in base model pre-training corpora is not addressed."
    435     },
    436     {
    437       "flag": "Key training hyperparameters missing",
    438       "detail": "Learning rate, batch size, optimizer, and regularization are not reported, making replication of the fine-tuning procedure difficult despite code release."
    439     },
    440     {
    441       "flag": "Single-institution single-population training data",
    442       "detail": "All 18,885 scans in the dataset come from one hospital's elderly Alzheimer's patients (mean age 82.6); the model is confirmed to fail on CQ500 features absent from the training distribution."
    443     }
    444   ],
    445   "cited_papers": [
    446     {
    447       "title": "LLaVA-Med: Training a Large Language-and-Vision Assistant for Biomedicine in One Day",
    448       "relevance": "Key prior work on medical MLLM fine-tuning; BrainGPT extends this paradigm to volumetric 3D CT report generation"
    449     },
    450     {
    451       "title": "Towards Generalist Biomedical AI (Med-PaLM M)",
    452       "relevance": "Competing medical MLLM with multimodal capabilities including CXR and single-slice CT; used as performance comparison"
    453     },
    454     {
    455       "title": "CT2Rep: Automated Radiology Report Generation for 3D Medical Imaging",
    456       "relevance": "Closest prior work on 3D CT report generation; BrainGPT compared favorably in training efficiency (12h vs 7 days)"
    457     },
    458     {
    459       "title": "Otter: A Multi-Modal Model with In-Context Instruction Tuning",
    460       "relevance": "Foundation model used for BrainGPT fine-tuning; provides multi-image in-context learning architecture"
    461     },
    462     {
    463       "title": "CheXpert: A Large Chest Radiograph Dataset with Uncertainty Labels and Expert Comparison",
    464       "relevance": "Defines CE evaluation metric that FORTE is designed to replace/complement for brain CT; motivates need for clinical evaluation metrics"
    465     },
    466     {
    467       "title": "Large language models encode clinical knowledge (Med-PaLM)",
    468       "relevance": "Establishes in-context example priming for medical QA; basis for the RVIT approach and 3-shot examples in BrainGPT-example"
    469     },
    470     {
    471       "title": "Advancing Multimodal Medical Capabilities of Gemini (Med-Gemini-3D)",
    472       "relevance": "Google's competing approach to 3D CT report generation with only 53% clinical validity in human evaluation; direct competitor"
    473     },
    474     {
    475       "title": "Deep learning algorithms for detection of critical findings in head CT scans: a retrospective study",
    476       "relevance": "Source of CQ500 external validation dataset used for zero-shot evaluation of BrainGPT generalization"
    477     },
    478     {
    479       "title": "MIMIC-CXR, a de-identified publicly available database of chest radiographs with free-text reports",
    480       "relevance": "Dominant existing dataset for radiology report generation; motivates why a 3D brain CT dataset is needed"
    481     },
    482     {
    483       "title": "Adapted large language models can outperform medical experts in clinical text summarization",
    484       "relevance": "Related work establishing MLLM human-level performance on clinical text; contextualizes Turing test findings"
    485     }
    486   ],
    487   "engagement_factors": {
    488     "practical_relevance": {
    489       "score": 3,
    490       "justification": "Directly addresses clinical radiology workflow bottleneck; releases model weights and code; proposes a transferable evaluation framework (FORTE) for other imaging modalities."
    491     },
    492     "surprise_contrarian": {
    493       "score": 2,
    494       "justification": "Counterintuitive finding that negation removal (discarding negative descriptions) dramatically improves clinical accuracy; challenges NLP community's standard evaluation metrics."
    495     },
    496     "fear_safety": {
    497       "score": 1,
    498       "justification": "Mild concern: AI-generated radiology reports indistinguishable from human reports by most physicians, with 'interpretation spree' hallucination-adjacent failures; limited safety framing in the paper."
    499     },
    500     "drama_conflict": {
    501       "score": 1,
    502       "justification": "Moderate tension with NLP evaluation status quo (BLEU/METEOR as gold standard); no major controversy or retraction risk."
    503     },
    504     "demo_ability": {
    505       "score": 3,
    506       "justification": "Code and model weights released on GitHub; researchers with brain CT data can directly run BrainGPT-keyword and evaluate with FORTE."
    507     },
    508     "brand_recognition": {
    509       "score": 2,
    510       "justification": "Published in Nature Communications; UCLA and Taipei Veterans General Hospital affiliations; builds on Microsoft/Meta/Google foundation models."
    511     }
    512   },
    513   "hn_data": {
    514     "threads": [
    515       {
    516         "hn_id": "39960717",
    517         "title": "Mixture-of-Depths: Dynamically allocating compute in transformers",
    518         "points": 281,
    519         "comments": 83,
    520         "url": "https://news.ycombinator.com/item?id=39960717"
    521       },
    522       {
    523         "hn_id": "39927422",
    524         "title": "Mixture-of-Depths: Dynamically allocating compute in transformer language models",
    525         "points": 5,
    526         "comments": 2,
    527         "url": "https://news.ycombinator.com/item?id=39927422"
    528       },
    529       {
    530         "hn_id": "39932637",
    531         "title": "Mixture-of-Depths: Dynamically allocating compute in transformers",
    532         "points": 4,
    533         "comments": 0,
    534         "url": "https://news.ycombinator.com/item?id=39932637"
    535       },
    536       {
    537         "hn_id": "39940557",
    538         "title": "DeepMind: Mixture-of-Depths: Dynamically allocating compute in transformers",
    539         "points": 2,
    540         "comments": 1,
    541         "url": "https://news.ycombinator.com/item?id=39940557"
    542       },
    543       {
    544         "hn_id": "45534311",
    545         "title": "Bad acronyms in papers are amusing",
    546         "points": 2,
    547         "comments": 0,
    548         "url": "https://news.ycombinator.com/item?id=45534311"
    549       },
    550       {
    551         "hn_id": "40002664",
    552         "title": "An Interview and Survey Study on How Rust Developers Use Unsafe Code",
    553         "points": 2,
    554         "comments": 0,
    555         "url": "https://news.ycombinator.com/item?id=40002664"
    556       },
    557       {
    558         "hn_id": "39949473",
    559         "title": "Dynamically allocating compute in transformer-based language models",
    560         "points": 2,
    561         "comments": 0,
    562         "url": "https://news.ycombinator.com/item?id=39949473"
    563       },
    564       {
    565         "hn_id": "41601432",
    566         "title": "The consistent reasoning paradox of intelligence and optimal trust in AI",
    567         "points": 1,
    568         "comments": 1,
    569         "url": "https://news.ycombinator.com/item?id=41601432"
    570       },
    571       {
    572         "hn_id": "40716550",
    573         "title": "LlamaCare: A Large Medical Language Model for Healthcare Knowledge Sharing",
    574         "points": 1,
    575         "comments": 0,
    576         "url": "https://news.ycombinator.com/item?id=40716550"
    577       },
    578       {
    579         "hn_id": "39008399",
    580         "title": "Image Collage on Arbitrary Shape via Shape-Aware Slicing and Optimization",
    581         "points": 1,
    582         "comments": 0,
    583         "url": "https://news.ycombinator.com/item?id=39008399"
    584       }
    585     ],
    586     "top_points": 281,
    587     "total_points": 301,
    588     "total_comments": 87
    589   }
    590 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs