scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20200B)
      1 {
      2   "paper": {
      3     "title": "Assessing data extraction in randomized clinical trials with large language models",
      4     "authors": ["Zuhaer Yisha", "Peng Zou", "Sheng Li", "Lin Zhang", "Linfa Guo", "Aodun Gu", "Guiyong Liu", "Tongzu Liu", "Xiaolong Wang"],
      5     "year": 2026,
      6     "venue": "BMC Medical Research Methodology",
      7     "doi": "10.1186/s12874-025-02729-5"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No source code or repository link is provided. The paper describes prompts and a workflow but does not release code."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The datasets are available in the OSF repository (https://osf.io/8fzps/) as stated in the Data Availability section."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications provided. The paper mentions SPSS 27 and MetaXL 5.3 for analysis but no reproducible environment setup."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided. Prompts are in supplementary tables but there is no guide to replicate the full workflow."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Binomial 95% confidence intervals are reported for all Pacc values (e.g., '94% (95% CI: 85%, 100%)') and for Kappa coefficients in Table 2."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper states 'task performance (Pacc) did not differ significantly between original and modified prompts' but does not report which test was used or p-values for this claim. No formal significance tests are reported for comparisons between ChatGPT-4 and Claude 3 Opus."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Pacc values with baseline context are reported throughout (e.g., '57% (95% CI: 48%, 69%)'), and Cohen's Kappa is reported for reliability. The percentage differences between models and between binary/continuous outcomes provide effect size context."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The sample of 105 trials from two systematic reviews is described but no power analysis or formal justification for this sample size is provided. The limitations section acknowledges the 'relatively small sample.'"
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "I² heterogeneity statistics are reported for pooled estimates (e.g., 'I² = 26%', 'I² = 60%'), and ranges of Pacc across outcomes are provided. Confidence intervals for Kappa are also reported."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Results are compared against a verified gold-standard human-extracted dataset, and contextual comparison to human single-extraction accuracy (~65%) and double-extraction (~75%) from prior literature is provided."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The two LLMs tested (ChatGPT-4 and Claude 3 Opus) were contemporary at the time of data collection (mid-May 2024). The gold standard baseline is appropriate for this task."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The study compares original vs. modified prompts (Tables S2 and S3), effectively testing the contribution of prompt engineering. Results were generated with both prompt versions, but only the modified-prompt results are reported in detail because the two versions did not differ significantly."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics are used: Pacc (accuracy), Pinc (incorrect extractions), Pfail (failed extractions), and Cohen's Kappa for test-retest reliability."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The entire evaluation is based on comparison against human-verified gold-standard data from two previously validated systematic reviews."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is not a machine learning training study; it evaluates LLM extraction on a fixed set of trials. No train/test split is applicable."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by outcome type (binary vs. continuous), by task type (group size, event count, mean, SD), and by individual outcome (e.g., adverse effects, dropout rates) in Figures 2-6."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The Discussion section identifies three main failure modes: complex table layouts, nonstandard reporting (median/IQR instead of mean/SD), and missing locator words within PDF texts."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Poor performance on continuous outcomes (Pacc 24%-56%) is prominently reported. The paper honestly acknowledges these failures."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims about high accuracy for binary group sizes (91-94%), moderate for event counts (57-71%), and poor for continuous outcomes (24-56%) are all supported by the results in Figures 2-6."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The Discussion makes causal claims about why performance is poor (e.g., 'models are primarily designed to summarize key findings', sensitivity to prompt variations, lack of biomedical training) without controlled experiments to isolate these factors."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper explicitly states this is a 'proof-of-concept study' with 'preliminary results' from a 'relatively small sample—105 trials from two systematic reviews in a single journal—which may limit generalizability.' The title uses 'assessing' rather than making broad claims."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The Discussion considers multiple alternative explanations for poor performance: complex table layouts, nonstandard reporting, missing locator words, general vs. biomedical training, lack of retrieval capability, and prompt sensitivity."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper says 'ChatGPT-4 (paid ChatGPT Plus subscription)' and 'Claude 3 Opus (paid professional tier)' without specific model version snapshots or API version identifiers. No snapshot dates are provided."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Original and modified prompts are provided in supplementary Tables S2 and S3, with example responses in Figures S1-2."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported. The models were used via 'the standard web interface' without specifying any generation parameters."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. Each prompt was executed independently in a new session via the web interface."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The data pipeline is documented: two systematic reviews selected, gold standard verified through prior reproducibility projects, only primary full-text PDFs included (supplementary materials excluded), exact matching criteria for correctness defined."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "A dedicated 'Limitation' section is present on pages 8-9 discussing four specific limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Specific threats discussed: small sample from single journal limiting generalizability, variability in responses to identical prompts, focus on only accuracy and reliability metrics without F1/sensitivity/specificity, and lack of assessment of AI's error-correction capability."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper explicitly states it is limited to 105 trials from two reviews in a single journal, did not assess F1/sensitivity/specificity/hallucination rate, did not test AI's error-detection capability, and excluded supplementary materials."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Raw data is available in the OSF repository (https://osf.io/8fzps/)."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Data collection is described: mid-May 2024, via standard web interfaces, each prompt in a new session, three runs per prompt, only first output used for Pacc calculation."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants; the study uses existing systematic review data and LLM outputs."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline is documented: selection of two systematic reviews → verification of gold standard → prompt development → execution via web interface → comparison with gold standard → statistical analysis. Figure 1 shows the workflow."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Funding is disclosed: Health Commission of Hubei Province Scientific Research Project (Grant WJ2019H057) and National Natural Science Foundation of China (Grant No. 82400906)."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All author affiliations are listed. Authors are from universities and hospitals, not from OpenAI or Anthropic, so there is no product-evaluation conflict."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The funders (Hubei Province health commission and NSFC) have no financial interest in LLM performance. The paper explicitly states 'The researchers retained complete autonomy throughout all stages of the study... with no interference from the funding organizations.'"
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "'The authors declare no competing interests' is stated in the Declarations section."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper does not state the training data cutoff dates for ChatGPT-4 or Claude 3 Opus. This is relevant because the RCT papers being extracted from could have been in training data."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether the RCT papers used for evaluation were in the training data of ChatGPT-4 or Claude 3 Opus. The models may have memorized content from these published trials."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The RCTs and systematic reviews used are published papers that were likely in the training data of both LLMs. This contamination risk is not discussed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study; it evaluates LLM extraction against existing verified datasets."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants. The paper states 'Ethics approval and consent to participate: Not applicable.'"
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference costs reported. The paper used paid subscriptions (ChatGPT Plus and Claude professional tier) but does not report per-extraction costs, token usage, or total expenditure."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No compute budget stated. Data collection timing (mid-May 2024) is mentioned but total time or cost for running all prompts is not reported."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "ChatGPT-4 and Claude 3 Opus achieve high accuracy (91-94%) for extracting group sizes from binary outcome RCTs.",
    286       "evidence": "Figure 2 shows Pacc of 94% (95% CI: 85-100%) and 92% (81-99%) for ChatGPT-4, and 91% (80-98%) for Claude 3 Opus on binary group size extraction across 41 trials.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "LLM accuracy for extracting continuous outcome data (mean and SD) is poor, ranging from 24% to 56%.",
    291       "evidence": "Figures 5-6 show weighted mean Pacc of 30% (ChatGPT-4) and 56% (Claude) for means, and 24% (ChatGPT-4) and 44% (Claude) for standard deviations.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Claude 3 Opus has higher test-retest reliability than ChatGPT-4.",
    296       "evidence": "Table 2 shows Kappa values of 0.84-0.96 for Claude vs. 0.67-0.74 for ChatGPT-4.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "AI-assisted extraction can achieve accuracy 'equal or superior' to manual single-extraction for binary outcomes.",
    301       "evidence": "The paper compares 91-94% Pacc to literature-reported human single-extraction accuracy of ~65%, but this comparison is to external literature figures rather than a head-to-head experiment.",
    302       "supported": "moderate"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "ChatGPT-4 and Claude 3 Opus can extract binary outcome data (group sizes, event counts) from RCT reports with moderate-to-high accuracy (57-94%), but performance on continuous outcomes (means, standard deviations) is poor (24-56%). Claude 3 Opus shows substantially higher test-retest reliability than ChatGPT-4 (Kappa 0.84-0.96 vs. 0.67-0.74). The study is a proof-of-concept on 105 trials from two urology systematic reviews, and the authors appropriately caution against generalizing these preliminary findings.",
    307   "red_flags": [
    308     {
    309       "flag": "Contamination risk unaddressed",
    310       "detail": "The RCT papers used for evaluation are published articles that were likely in the training data of both LLMs. If the models memorized content from these papers, reported accuracy may overestimate real-world performance on unseen documents. This is never discussed."
    311     },
    312     {
    313       "flag": "No model version specificity",
    314       "detail": "Models are identified only as 'ChatGPT-4 (paid ChatGPT Plus subscription)' and 'Claude 3 Opus (paid professional tier)', accessed via the web interface, without snapshot dates or version identifiers. Web interface behavior changes over time, making results non-reproducible."
    315     },
    316     {
    317       "flag": "No hyperparameter control",
    318       "detail": "Using the web interface means no control over temperature, sampling, or other generation parameters. This could explain some of the test-retest variability observed."
    319     }
    320   ],
    321   "cited_papers": [
    322     {
    323       "title": "Data extraction for evidence synthesis using a large language model: A proof-of-concept study",
    324       "authors": ["Gartlehner G", "Kahwati L", "Hilscher R"],
    325       "relevance": "Directly related proof-of-concept study on LLM-based data extraction for evidence synthesis.",
    326       "doi": "10.1002/jrsm.1710"
    327     },
    328     {
    329       "title": "Large language models encode clinical knowledge",
    330       "authors": ["Singhal K", "Azizi S", "Tu T"],
    331       "year": 2023,
    332       "relevance": "Foundational work on LLM capabilities in biomedical domains.",
    333       "doi": "10.1038/s41586-023-06291-2"
    334     },
    335     {
    336       "title": "OpenMedLM: prompt engineering can out-perform fine-tuning in medical question-answering with open-source large language models",
    337       "authors": ["Maharjan J", "Garikipati A", "Singh NP"],
    338       "year": 2024,
    339       "relevance": "Examines prompt engineering vs. fine-tuning for LLM performance in medical contexts."
    340     },
    341     {
    342       "title": "Exploring the use of a large language model for data extraction in systematic reviews: a rapid feasibility study",
    343       "authors": ["Schmidt L", "Hair K", "Graziosi S"],
    344       "year": 2024,
    345       "arxiv_id": "2405.14445",
    346       "relevance": "Related feasibility study on LLM data extraction for systematic reviews."
    347     },
    348     {
    349       "title": "Automatically extracting numerical results from randomized controlled trials with large language models",
    350       "authors": ["Yun HS", "Pogrebitskiy D", "Marshall IJ"],
    351       "year": 2024,
    352       "relevance": "Directly comparable study on automated numerical extraction from RCTs using LLMs."
    353     },
    354     {
    355       "title": "From promise to practice: challenges and pitfalls in the evaluation of large language models for data extraction in evidence synthesis",
    356       "authors": ["Gartlehner G", "Kahwati L", "Nussbaumer-Streit B"],
    357       "year": 2024,
    358       "relevance": "Discusses challenges in evaluating LLMs for evidence synthesis data extraction.",
    359       "doi": "10.1136/bmjebm-2024-113199"
    360     },
    361     {
    362       "title": "Opportunities, challenges and risks of using artificial intelligence for evidence synthesis",
    363       "authors": ["Siemens W", "von Elm E", "Binder H"],
    364       "year": 2025,
    365       "relevance": "Broad assessment of AI risks and opportunities in evidence synthesis workflows.",
    366       "doi": "10.1136/bmjebm-2024-113320"
    367     },
    368     {
    369       "title": "A Prompt Pattern Catalog to Enhance Prompt Engineering with ChatGPT",
    370       "authors": ["White J", "Fu Q", "Hays S"],
    371       "year": 2023,
    372       "relevance": "Prompt engineering methodology relevant to LLM-based tool evaluation."
    373     }
    374   ]
    375 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs