scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23163B)
      1 {
      2   "paper": {
      3     "title": "Comparing code-free and bespoke deep learning approaches in ophthalmology",
      4     "authors": [
      5       "Carolyn Yu Tung Wong",
      6       "Ciara O'Byrne",
      7       "Priyal Taribagil",
      8       "Timing Liu",
      9       "Fares Antaki",
     10       "Pearse Andrew Keane"
     11     ],
     12     "year": 2024,
     13     "venue": "Graefe's Archive for Clinical and Experimental Ophthalmology",
     14     "doi": "10.1007/s00417-024-06432-x"
     15   },
     16   "checklist": {
     17     "artifacts": {
     18       "code_released": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "No code repository, analysis scripts, or supplementary code are mentioned anywhere in the paper. As a review paper, analysis or data extraction scripts could have been released but were not."
     22       },
     23       "data_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No dataset or structured extraction data is released. The review relies on published literature but does not release its own extracted data tables or search results in a reusable format."
     27       },
     28       "environment_specified": {
     29         "applies": false,
     30         "answer": false,
     31         "justification": "This is a narrative mini review paper with no computational experiments. There is no software environment to specify."
     32       },
     33       "reproduction_instructions": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No step-by-step reproduction instructions are provided. The search methodology is briefly described (PubMed search with keywords 'autoML' AND 'ophthalmology') but there are no detailed instructions for reproducing the review process."
     37       }
     38     },
     39     "statistical_methodology": {
     40       "confidence_intervals_or_error_bars": {
     41         "applies": false,
     42         "answer": false,
     43         "justification": "This is a narrative review that does not conduct any statistical analyses. It reports performance metrics from reviewed studies but does not perform its own statistical computations."
     44       },
     45       "significance_tests": {
     46         "applies": false,
     47         "answer": false,
     48         "justification": "No statistical significance tests are applicable; this is a narrative review comparing studies qualitatively, not performing meta-analysis."
     49       },
     50       "effect_sizes_reported": {
     51         "applies": false,
     52         "answer": false,
     53         "justification": "No effect sizes are computed by the authors. The paper reports raw performance metrics from the reviewed studies but does not calculate any pooled effect sizes."
     54       },
     55       "sample_size_justified": {
     56         "applies": false,
     57         "answer": false,
     58         "justification": "No statistical experiments are conducted by the authors. Sample size justification is not applicable to this narrative review."
     59       },
     60       "variance_reported": {
     61         "applies": false,
     62         "answer": false,
     63         "justification": "No experimental runs are performed. This is a narrative review, so variance across runs is not applicable."
     64       }
     65     },
     66     "evaluation_design": {
     67       "baselines_included": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "The review does not compare itself against prior reviews or surveys on the same topic. It does compare CFDL vs bespoke DL studies, but this is the content of the review, not a baseline comparison for the review itself as a methodology."
     71       },
     72       "baselines_contemporary": {
     73         "applies": false,
     74         "answer": false,
     75         "justification": "As a narrative mini review, there is no experimental evaluation with baselines. The question of contemporary baselines does not apply."
     76       },
     77       "ablation_study": {
     78         "applies": false,
     79         "answer": false,
     80         "justification": "No system or method with components to ablate. This is a review paper."
     81       },
     82       "multiple_metrics": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No experiments are conducted by the authors. This is a narrative review."
     86       },
     87       "human_evaluation": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "No system outputs are produced that would require human evaluation. This is a review paper."
     91       },
     92       "held_out_test_set": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "No experiments are conducted. This is a narrative review paper."
     96       },
     97       "per_category_breakdown": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The review is organized by task category (DR screening, retinal multi-disease classification, surgical video classification, oculomics, resource management) with detailed per-task tables (Tables 1-5) showing breakdowns of each study's performance, limitations, and findings."
    101       },
    102       "failure_cases_discussed": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper discusses limitations and failure modes of reviewed studies throughout, including lack of external validation, small datasets, class imbalance, limited generalizability, and black-box nature. The Limitations section explicitly discusses biases and challenges."
    106       },
    107       "negative_results_reported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The paper reports negative findings: 'discussions of CFDL were mostly done mono-dimensionally' and 'positive conclusions drawn on CFDL's benefits were largely based on the system-derived performance results' without holistic comparison. It also reports performance drops during external validation (e.g., Korot et al.'s accuracy dropped from 86.5% to 78.6%)."
    111       }
    112     },
    113     "claims_and_evidence": {
    114       "abstract_claims_supported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The abstract claims that 'studies were optimistic towards CFDL's advantages over bespoke DL' but discussions were 'mono-dimensional and had wide applicability gaps.' Both claims are supported by the detailed per-task analyses in the Results and Discussion sections."
    118       },
    119       "causal_claims_justified": {
    120         "applies": false,
    121         "answer": false,
    122         "justification": "The paper does not make causal claims. It is a narrative review that reports findings from existing studies and argues for a multidimensional evaluation framework, without claiming causal relationships."
    123       },
    124       "generalization_bounded": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The paper explicitly bounds its scope to ophthalmological tasks and notes limitations: 'Our model-to-model comparison per task is subject to biases because of the different datasets used' and acknowledges the review covers only 10 studies across 5 specific tasks."
    128       },
    129       "alternative_explanations_discussed": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The Discussion section considers that apparent CFDL advantages may be artifacts of different datasets, lack of external validation, and mono-dimensional evaluation. The Limitations section discusses dataset biases, single-centre designs, and small dataset sizes as alternative explanations for reported performance."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": false,
    138         "answer": false,
    139         "justification": "No AI models are used by the authors. This is a review paper that discusses models used in the reviewed studies."
    140       },
    141       "prompts_provided": {
    142         "applies": false,
    143         "answer": false,
    144         "justification": "No prompting is used. This is a review paper."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": false,
    148         "answer": false,
    149         "justification": "No experiments are conducted by the authors. This is a review paper."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No agentic scaffolding is used. This is a review paper."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "The search process is described at a high level (PubMed search with 'autoML' AND 'ophthalmology', then subsequent search for equivalent DL studies), with a flow diagram in Fig. 1. However, the actual filtering criteria beyond language and article type are vague. No clear criteria for how 'equivalent DL studies' were identified for each task. The paper says 'We identified ten relevant studies' without specifying how many were initially found or the screening criteria at each stage."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "A dedicated 'Limitations' section is present near the end of the paper, discussing biases from different datasets, lack of external validation, black-box challenges, and single-centre study designs."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "The Limitations section discusses specific threats: biases from different datasets used for CFDL vs bespoke models, lack of routine external validation across reviewed studies, uncertain effectiveness of external validation against adversarial attacks, single-centre observational study designs, and small dataset sizes with class imbalance."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The paper does not explicitly state what is outside its scope. It focuses on 5 ophthalmological tasks but does not explicitly list what tasks or aspects it did not cover. The limitations discuss biases but do not draw clear boundaries around what claims are and are not being made."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No raw data, extracted metrics, or search results are made available for independent verification. The structured comparison data in the tables is not released in a machine-readable format."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "The Methods section describes the search process: MEDLINE/PubMed search on June 25, 2023, using keywords 'autoML' AND 'ophthalmology', followed by a second search for equivalent DL studies. Inclusion criteria (English, full text available) and exclusion criteria (reviews, editorials, protocols, case reports/series) are stated."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": false,
    192         "answer": false,
    193         "justification": "No human participants are involved. This is a review of published literature."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "The paper briefly describes the search strategy and references Fig. 1 for the search process, but the pipeline from initial search results to the final 10 studies is not documented with counts at each stage. There is no PRISMA-style reporting of how many studies were found initially, screened, and excluded at each step."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "No funding information or acknowledgments section listing grants or sponsors is provided in the paper."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations are clearly listed: Institute of Ophthalmology (UCL), Moorfields Eye Hospital NHS Foundation Trust, Chinese University of Hong Kong, CHUM School of AI in Healthcare, and NIHR Moorfields Biomedical Research Centre. Notably, author Fares Antaki is also a co-author of one of the reviewed CFDL studies (Antaki et al. [18])."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding is disclosed, so independence of funders cannot be assessed. The absence of a funding disclosure means this criterion is not satisfied."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "The Declarations section states: 'The authors declare no competing interests.' A competing interests statement is present."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": false,
    226         "answer": false,
    227         "justification": "This is a review paper that does not evaluate any pre-trained model on a benchmark. Contamination criteria do not apply."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": false,
    231         "answer": false,
    232         "justification": "This is a review paper, not a benchmark evaluation. Contamination criteria do not apply."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "This is a review paper, not a benchmark evaluation. Contamination criteria do not apply."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human participants are involved. This is a review of published studies."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants. The paper explicitly states: 'This article does not contain any studies with human participants or animals performed by any of the authors.'"
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this review."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this review."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this review."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this review."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this review."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "This is a survey/review paper. It discusses costs of CFDL in general (e.g., 'processing up to 35,000 images with less than US$100') but does not have its own method with inference costs."
    282       },
    283       "compute_budget_stated": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "This is a survey/review paper with no computational experiments."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "CFDL has shown comparable performance to bespoke DL models across five ophthalmological tasks (DR screening, retinal multi-disease classification, surgical video classification, oculomics, resource management).",
    293       "evidence": "Tables 1-5 and the Results section present per-task comparisons. For example, the CFDL DR screening model achieved ACC 97% and F1 96%, while bespoke DL achieved AUROC 0.99 and SN 93.86%. The CFDL surgical video classifier achieved ACC 96% vs. bespoke DL's 84%.",
    294       "supported": "moderate"
    295     },
    296     {
    297       "claim": "Discussions of CFDL's advantages over bespoke DL in the reviewed literature are largely mono-dimensional, focusing on performance metrics without considering implementation factors like patient acceptance and cost-effectiveness.",
    298       "evidence": "The Discussion section states: 'positive conclusions drawn on CFDL's benefits were largely based on the system-derived performance results' and 'discussions of CFDL were mostly done mono-dimensionally, seldomly discussing other implementation demands of AI.' This is supported by the observation that none of the 5 CFDL studies included patient attitude information (p. 2794).",
    299       "supported": "strong"
    300     },
    301     {
    302       "claim": "CFDL and bespoke DL are unique in their assets and irreplaceable with each other; their benefits are differentially valued on a case-to-case basis.",
    303       "evidence": "The Conclusion states this directly, supported by the per-task discussion showing CFDL advantages in some contexts (low-cost prototyping, resource management) and bespoke DL advantages in others (interpretability for diagnostic tasks). However, this is based on a narrative comparison of only 10 studies.",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "External validation has not been routinely performed across the reviewed studies, making claims of model robustness liable to biases.",
    308       "evidence": "The Limitations section notes this, and the tables confirm: Nunez et al. (Table 1), Abitbol et al. (Table 2), Yeh et al. (Table 3), Munk et al. (Table 4), and both resource management studies (Table 5) had 'NR' (not reported) for externally validated performance metrics.",
    309       "supported": "strong"
    310     }
    311   ],
    312   "methodology_tags": [
    313     "meta-analysis",
    314     "qualitative"
    315   ],
    316   "key_findings": "This mini review compares code-free deep learning (CFDL) and bespoke deep learning across five ophthalmological tasks (DR screening, retinal disease classification, surgical video classification, oculomics, and resource management), finding that CFDL shows comparable performance to bespoke DL in the reviewed studies. The authors argue that existing evaluations are mono-dimensional, focusing only on model performance metrics while neglecting implementation considerations such as developer intent, patient acceptance, cost-effectiveness, and model interpretability. The paper concludes that CFDL and bespoke DL serve complementary roles and that future work should adopt a multidimensional assessment framework.",
    317   "red_flags": [
    318     {
    319       "flag": "Author conflict of interest not acknowledged",
    320       "detail": "Fares Antaki, a co-author of this review, is also a co-author of Antaki et al. [18], one of the 5 CFDL studies being reviewed. This conflict is not disclosed or discussed, despite the review assessing CFDL's promise. The review generally presents CFDL favorably."
    321     },
    322     {
    323       "flag": "Very narrow search strategy",
    324       "detail": "The literature search used only 'autoML' AND 'ophthalmology' in PubMed, which is extremely narrow and likely missed relevant CFDL studies that use different terminology (e.g., 'no-code deep learning', 'automated deep learning', 'transfer learning platform'). Only 10 studies were included."
    325     },
    326     {
    327       "flag": "Non-systematic review presented with systematic language",
    328       "detail": "The paper describes a search strategy and uses tables to structure its comparison, but lacks key systematic review elements: no PRISMA checklist, no risk-of-bias assessment of included studies, no protocol registration, and no documentation of study counts at each screening stage."
    329     },
    330     {
    331       "flag": "Comparisons across different datasets",
    332       "detail": "The review compares CFDL and bespoke DL models that were trained on different datasets (acknowledged in Limitations), making performance comparisons unreliable. This is a fundamental methodological limitation that undermines the paper's central comparison."
    333     },
    334     {
    335       "flag": "No quality assessment of reviewed studies",
    336       "detail": "The review does not apply any quality assessment tool (e.g., PROBAST, QUADAS-2) to the included studies. Without structured quality assessment, methodological weaknesses in the source studies may carry over into the review's conclusions unexamined."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "Automated deep learning in ophthalmology: AI that can build AI",
    342       "authors": ["C. O'Byrne", "A. Abbas", "E. Korot", "P.A. Keane"],
    343       "year": 2021,
    344       "relevance": "Directly relevant review of code-free/automated deep learning approaches in ophthalmology, establishing the CFDL concept evaluated in this paper."
    345     },
    346     {
    347       "title": "Automated machine learning: review of the state-of-the-art and opportunities for healthcare",
    348       "authors": ["J. Waring", "C. Lindvall", "R. Umeton"],
    349       "year": 2020,
    350       "relevance": "Review of automated machine learning (AutoML) in healthcare, establishing the broader context for code-free AI development tools."
    351     },
    352     {
    353       "title": "Code-free deep learning for multi-modality medical image classification",
    354       "authors": ["E. Korot", "Z. Guan", "D. Ferraz"],
    355       "year": 2021,
    356       "relevance": "Demonstrates code-free deep learning can process large image datasets cheaply (35,000 images for <$100), establishing cost-effectiveness claims for CFDL."
    357     },
    358     {
    359       "title": "Evaluating an automated machine learning model that predicts visual acuity outcomes in patients with neovascular age-related macular degeneration",
    360       "authors": ["A. Abbas", "C. O'Byrne", "D.J. Fu"],
    361       "year": 2022,
    362       "relevance": "Empirical evaluation of AutoML model performance in ophthalmology, directly relevant to the survey's scope of comparing automated vs bespoke approaches."
    363     },
    364     {
    365       "title": "Development and international validation of custom-engineered and code-free deep-learning models for detection of plus disease in retinopathy of prematurity: a retrospective study",
    366       "authors": ["S.K. Wagner", "B. Liefers", "M. Radia"],
    367       "year": 2023,
    368       "relevance": "International validation study comparing code-free and custom DL models, providing evidence on CFDL matching bespoke DL performance."
    369     },
    370     {
    371       "title": "Democratizing artificial intelligence: how no-code AI can leverage machine learning operations",
    372       "authors": ["L. Sundberg", "J. Holmström"],
    373       "year": 2023,
    374       "relevance": "Examines the democratization of AI through no-code platforms, relevant to the broader theme of making AI accessible to non-programmers."
    375     },
    376     {
    377       "title": "The importance of being external. Methodological insights for the external validation of machine learning models in medicine",
    378       "authors": ["F. Cabitza", "A. Campagner", "F. Soares"],
    379       "year": 2021,
    380       "relevance": "Discusses methodology for external validation of ML models in medicine, directly relevant to the paper's emphasis on validation gaps in CFDL studies."
    381     },
    382     {
    383       "title": "Cost-effectiveness of artificial intelligence as a decision-support system applied to the detection and grading of melanoma, dental caries, and diabetic retinopathy",
    384       "authors": ["J. Gomez Rossi", "N. Rojas-Perilla", "J. Krois", "F. Schwendicke"],
    385       "year": 2022,
    386       "relevance": "Cost-effectiveness analysis of AI decision-support systems relevant to the paper's argument that CFDL evaluations should consider cost-effectiveness."
    387     }
    388   ]
    389 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs