scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19497B)
      1 {
      2   "paper": {
      3     "title": "Concordance of randomised controlled trials for artificial intelligence interventions with the CONSORT-AI reporting guidelines",
      4     "authors": ["Alexander P. L. Martindale", "Carrie D. Llewellyn", "Richard O. de Visser", "Benjamin Ng", "Victoria Ngai", "Aditya U. Kale", "Lavinia Ferrante di Ruffano", "Robert M. Golub", "Gary S. Collins", "David Moher", "Melissa D. McCradden", "Lauren Oakden-Rayner", "Samantha Cruz Rivera", "Melanie Calvert", "Christopher J. Kelly", "Cecilia S. Lee", "Christopher Yau", "An-Wen Chan", "Pearse A. Keane", "Andrew L. Beam", "Alastair K. Denniston", "Xiaoxuan Liu"],
      5     "year": 2024,
      6     "venue": "Nature Communications",
      7     "doi": "10.1038/s41467-024-45355-3"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No code repository or analysis scripts are provided. The Data availability section states materials are 'available upon request to the corresponding author.'"
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Supplementary Data 1 and 2 are provided with the paper, containing the full list of included RCTs and concordance data. The paper states 'All data used in this study is referenced and publicly available. Supplementary information has been provided.'"
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications are provided. The paper mentions using SPSS Version 25.0, Covidence, and R Statistical Software v4.1.1, but no requirements file or reproducible environment setup."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided. The Methods section describes the methodology but does not provide a reproducible workflow with specific commands or scripts."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
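     36         "justification": "IQRs are reported throughout for concordance values, e.g., 'Median concordance with CONSORT-AI reporting was 90% (IQR 77–94%).' An IQR describes the spread of the data rather than a confidence interval around the median, but it is the uncertainty information the paper provides for its descriptive estimates."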
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Spearman's Rank-Order Correlation was used to test whether concordance changed with publication date: 'Spearman's r = −0.21, p = 0.091.' P-values under 0.05 were considered significant."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Concordance differences are reported with enough context: e.g., CONSORT-AI mandated journals had 100% concordance vs 90% (IQR 76–94%) for non-mandated. Spearman's r = −0.21 is an effect size measure."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No power analysis or sample size justification is provided. The 65 RCTs represent all eligible studies found, which is appropriate for a systematic review, but the paper acknowledges 'this exploratory analysis was limited by the small number of studies' without formal justification."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "IQRs are reported alongside medians throughout, for both concordance metrics and study characteristics, providing spread measures. E.g., 'Median sample size across all included RCTs was 186 (IQR 56-654).'"
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares concordance across subgroups: those using CONSORT-AI vs CONSORT 2010 vs no guidelines (Table 2), and journals mandating vs not mandating guidelines (Table 4). Previous systematic reviews are also compared in the Discussion."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares against three prior systematic reviews (refs 84-86) that used CONSORT-AI to evaluate RCTs, noting methodological differences and earlier search dates."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "This is a systematic review, not a system with components to ablate."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Concordance is reported for all CONSORT-AI items combined, AI-specific items only, and non-AI-specific items separately. Per-item concordance percentages are also provided."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is a systematic review assessing reporting completeness; there is no system output to evaluate with human judges."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Not applicable to a systematic review; there is no train/test split."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 3 provides per-item concordance for all 14 AI-specific CONSORT-AI items. Results are also broken down by geography, guideline usage, and journal mandate status."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The Discussion identifies specific poorly reported items (algorithm version at 20%, code accessibility at 42%, protocol access at 31%) and discusses why these items were underreported."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that no significant correlation was found between publication date and concordance (Spearman's r = −0.21, p = 0.091), and that only 3/52 journals endorsed or mandated CONSORT-AI."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims (90% median concordance, only 10 RCTs explicitly used CONSORT-AI, only 3/52 journals endorsed it, algorithm version and code accessibility poorly reported) are all supported by the Results section and tables."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper implies causal links, e.g., 'This may point towards a higher level of editorial scrutiny in journals which promote better reporting practices.' This is observational data and the association could be confounded by journal quality, author expertise, etc. The paper uses hedging language ('may') but does not discuss specific confounds."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper bounds its scope to English-language RCTs published after September 2020, and acknowledges limitations of its search strategy including potential incomplete retrieval and exclusion of non-English RCTs."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The Discussion considers alternative explanations: high concordance without CONSORT-AI awareness may reflect general clinical trial standards; low algorithm version reporting may reflect commercial considerations; the narrow date range limits correlation analysis."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "This is a systematic review; no AI models are used or evaluated by the authors."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No prompting or LLMs used in this systematic review."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No AI models are used in the conduct of this review."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The PRISMA flow diagram (Fig. 1) documents the full pipeline: 8263 records imported → 5111 after deduplication → 332 full-text review → 65 included, with specific exclusion counts and reasons at each stage."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "There is a substantive limitations discussion within the Discussion section, covering incomplete study retrieval, timing of CONSORT-AI adoption, search term confounding, and exclusion of non-English RCTs."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Specific threats are discussed: search terms for AI/ML confound item 1a,b(i) concordance assessment; RCTs may have been submitted before CONSORT-AI was known; trial registry indexing errors could cause missed studies; non-English exclusion biases the sample given the diverse geographical spread."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper explicitly states it covers only English-language RCTs published after September 2020, excludes conference abstracts and protocols, and notes the exploratory correlation analysis was limited by small N and narrow date range."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Supplementary Data 1 contains the full list of included RCTs; Supplementary Data 2 contains concordance data for non-AI-specific items. Further materials are available upon request."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The Methods section describes the search strategy (MEDLINE, Embase, Cochrane, clinical trial registries), search date (19 September 2022), date range (from 9 September 2020), and data extraction procedure with two independent reviewers."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants were recruited. This is a systematic review of published papers."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The PRISMA flow diagram documents each stage with counts: 8263 imported → 3152 duplicates removed → 4779 excluded at screening → 267 excluded at full-text with itemized reasons → 65 included. Data extraction by two independent reviewers with conflict resolution described."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The paper is published in Nature Communications which requires funding and competing interests disclosures. The affiliations and acknowledgments sections are present (though specific funding details would be in the full published version's declarations)."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are fully listed. Notably, several authors (Liu, Denniston, Cruz Rivera, Calvert, Collins, Moher) are authors of the original CONSORT-AI guidelines (ref 8), and this connection is acknowledged in the Methods ('carried out in conjunction with CONSORT-AI authors')."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Several authors developed CONSORT-AI itself (ref 8). They have a reputational interest in CONSORT-AI being adopted and shown to be useful. This potential conflict is not explicitly discussed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is visible in the extracted text. The paper does not include an explicit declaration of financial interests."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This is a systematic review; no pre-trained models are evaluated on benchmarks."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not applicable; no model evaluation on benchmarks."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not applicable; no model evaluation on benchmarks."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "This is a systematic review of published papers, not a study with human participants. However, the review protocol was registered on OSF (doi.org/10.17605/OSF.IO/CRF3Q)."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants; this is a systematic review of published papers."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this systematic review."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The study selection criteria for papers are documented in the Methods section."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants; not an experimental study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants; not an experimental study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this systematic review."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "This is a systematic review; no computational method with inference costs."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "This is a systematic review; no significant compute was required."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Median concordance with CONSORT-AI reporting was 90% (IQR 77–94%) across 65 included AI RCTs.",
    286       "evidence": "Results section and Table 2 report this figure based on assessment of all 51 CONSORT-AI items across 65 RCTs.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Only 10 of 65 RCTs (15%) explicitly reported use of CONSORT-AI.",
    291       "evidence": "Results section states this directly, with breakdown in Table 2 showing subgroup concordance.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Algorithm version (item 5(i)) was the most poorly reported AI-specific item at only 20% concordance.",
    296       "evidence": "Table 3 provides per-item concordance for all 14 AI-specific items.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Only 3 of 52 journals explicitly endorsed or mandated CONSORT-AI.",
    301       "evidence": "Results section on journal mandates states this, identifying The Lancet Digital Health, The Lancet Gastroenterology & Hepatology, and Ophthalmology Science.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "There was no significant correlation between date of publication and CONSORT-AI concordance.",
    306       "evidence": "Spearman's r = −0.21, p = 0.091, acknowledged as limited by small N and narrow date range.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "RCTs in journals mandating CONSORT 2010 had higher overall concordance (92%) than those without mandates (82%).",
    311       "evidence": "Table 4 provides the breakdown with IQRs. However, no formal significance test is reported for this comparison.",
    312       "supported": "moderate"
    313     }
    314   ],
    315   "methodology_tags": ["meta-analysis"],
    316   "key_findings": "This systematic review of 65 AI RCTs published after CONSORT-AI's release found generally high reporting concordance (median 90%), but identified persistent gaps in algorithm version reporting (20%) and code accessibility (42%). Only 10/65 RCTs explicitly used CONSORT-AI and only 3/52 publishing journals endorsed or mandated it. Journal endorsement of reporting guidelines was associated with higher concordance, which may reflect editorial oversight, although these subgroup differences were not formally tested and could be confounded.",
    317   "red_flags": [
    318     {
    319       "flag": "Authors evaluated their own guideline",
    320       "detail": "Several authors of this systematic review (Liu, Denniston, Cruz Rivera, Calvert, Collins, Moher) are the developers of CONSORT-AI itself (ref 8). They have a reputational interest in CONSORT-AI being shown to be useful and adopted. This conflict is not explicitly discussed."
    321     },
    322     {
    323       "flag": "No formal comparison tests between subgroups",
    324       "detail": "The paper reports concordance differences between subgroups (e.g., journals mandating vs not mandating guidelines) but does not apply formal statistical tests to these comparisons, only to the time trend. The subgroup differences could be coincidental."
    325     }
    326   ],
    327   "cited_papers": [
    328     {
    329       "title": "Reporting guidelines for clinical trial reports for interventions involving artificial intelligence: the CONSORT-AI extension",
    330       "authors": ["X. Liu", "S. Cruz Rivera", "D. Moher", "M. J. Calvert", "A. K. Denniston"],
    331       "year": 2020,
    332       "doi": "10.1038/s41591-020-1034-x",
    333       "relevance": "The CONSORT-AI reporting guideline itself, directly relevant to AI methodology standards and reporting quality."
    334     },
    335     {
    336       "title": "A comparison of deep learning performance against health-care professionals in detecting diseases from medical imaging: a systematic review and meta-analysis",
    337       "authors": ["X. Liu"],
    338       "year": 2019,
    339       "relevance": "Meta-analysis comparing AI to clinicians in medical imaging, relevant to evaluating AI system claims and methodology."
    340     },
    341     {
    342       "title": "CONSORT 2010 Statement: updated guidelines for reporting parallel group randomised trials",
    343       "authors": ["K. F. Schulz", "D. G. Altman", "D. Moher"],
    344       "year": 2010,
    345       "relevance": "The foundational RCT reporting standard that CONSORT-AI extends, relevant to methodology quality assessment frameworks."
    346     },
    347     {
    348       "title": "Reducing waste from incomplete or unusable reports of biomedical research",
    349       "authors": ["P. Glasziou"],
    350       "year": 2014,
    351       "relevance": "Motivates why reporting quality matters — incomplete reporting leads to research waste, directly relevant to the survey's methodology quality theme."
    352     }
    353   ]
    354 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs