scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21485B)
      1 {
      2   "paper": {
      3     "title": "Cocreating an Automated mHealth Apps Systematic Review Process With Generative AI: Design Science Research Approach",
      4     "authors": ["Guido Giunti", "Colin P Doherty"],
      5     "year": 2024,
      6     "venue": "JMIR Medical Education",
      7     "doi": "10.2196/48949"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "A GitHub repository is referenced: https://github.com/guidogiunti/ChatGPT-SR-script (reference [32]). The working script version is stated to be available there."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper does not release the CSV output files, the keywords.csv file, or the comparison data from prior systematic reviews. Only the script itself is on GitHub."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions Python and several libraries (requests, bs4) but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The cocreation process is described narratively, but there is no reproducibility section explaining how to replicate the study."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": false,
     35         "answer": false,
     36         "justification": "This is a qualitative case study / DSR paper with no quantitative experimental results that would require confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "The paper makes no comparative claims based on quantitative data that would require significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No quantitative effects are measured. The paper is a qualitative feasibility study."
     47       },
     48       "sample_size_justified": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "This is a single-case DSR study, not a quantitative experiment with a sample requiring justification."
     52       },
     53       "variance_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No repeated experimental runs are conducted. The cocreation process was performed once by a single author."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares the ChatGPT-generated script output to the output from the prior 42matters-based script used in background studies. The Evaluation section notes differences in the CSV output columns."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The baseline is the authors' own prior script using 42matters, described as 'no longer functional.' No comparison to contemporary AI-assisted systematic review tools or alternative LLM approaches is made."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "The system is a single cocreation process without separable components that could be ablated."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "The evaluation mentions time (4 hours 39 minutes) and a qualitative comparison of output columns, but no formal metrics are defined or reported systematically."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Although the authors qualitatively assessed the output, there is no structured human evaluation with defined criteria, multiple evaluators, or inter-rater reliability."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is not a benchmark evaluation. There is no test set to hold out."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The evaluation does not break down results by category (e.g., types of errors, types of apps found). Only aggregate observations are provided."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses multiple failures: code errors (ModuleNotFoundError), timeout issues, PubMed search problems, missing metadata columns, and the script's reliance on exact title matching for PubMed. The output was deemed 'unsuitable as a final output.'"
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that the ChatGPT-generated script produced an output that was 'useful as an intermediate outcome but was deemed unsuitable as a final output,' and that the Medical Evidence column only contained 'Yes' or 'No' based on title matching, missing important metadata."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims the study 'demonstrates the potential of using generative AI to automate the time-consuming process' and notes limitations. The paper provides a narrative account supporting these hedged claims."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper states 'this approach could be particularly useful for researchers with limited coding skills' and implies the cocreation process led to a functional script. These are causal-adjacent claims, but the study design (single case, single user, no controls) is inadequate for causal inference."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The abstract and conclusions generalize to 'researchers with limited coding skills' and 'the time-consuming process of conducting systematic reviews' broadly, but the study tested only one researcher, one topic (MS mHealth apps), one AI (ChatGPT 3.5), and one platform (Google Play Store). The title says 'automated mHealth apps systematic review process' which overstates what was achieved."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for the results. For example, the author's actual coding experience (rated 4-8 across various skills) may have substantially aided the process, but this is not examined as a confound."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper states 'ChatGPT 3.5, as of June 2023' but does not specify the exact model version (e.g., gpt-3.5-turbo-0613). 'ChatGPT 3.5' is a marketing name, not a versioned API model."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The paper includes several actual prompts used during the cocreation process, including the triggering prompt (full text quoted) and several follow-up prompts. While not every prompt is included, the key prompts are provided verbatim."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No API hyperparameters are reported (temperature, top-p, max tokens). The interaction appears to have been through the ChatGPT web interface, but no settings are mentioned."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The interaction is a simple conversational exchange through the ChatGPT web interface."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper does not clearly document how the background studies' data was preprocessed for comparison, or how the keywords.csv file was constructed."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "A dedicated 'Limitations' section is present, discussing DSR methodology limitations, subjectivity bias, and data quality issues."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "The limitations are mostly generic to DSR methodology ('subjectivity is a common bias present in DSR'). The most specific limitation mentioned is about the quality of search results from training data, but this is still relatively vague. No threats specific to this particular study's findings are discussed."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. There are no explicit scope boundaries like 'this does not demonstrate that non-coders could achieve the same results' or 'this does not apply to systematic reviews beyond mHealth apps.'"
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The full chat transcript with ChatGPT is not available. The CSV output files are not released. Only the final script is on GitHub. The reader cannot independently verify the cocreation process."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The data collection process (cocreation with ChatGPT) is described in the Results section with specific prompts, responses, and the iterative debugging process. The background studies used as comparison points are also referenced."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants were recruited. The study involved a single author interacting with ChatGPT."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The pipeline from ChatGPT interaction to final output is described narratively but lacks detail on intermediate steps. For example, how many iterations occurred total, how the comparison to prior study outputs was structured, and what specific criteria were used for the evaluation are not fully documented."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Funding is disclosed in the Acknowledgments section: EU Horizon 2020 Marie Skłodowska-Curie grant (101034252), Science Foundation Ireland grant (16/RC/3948), and Business Finland's More Stamina Research to Business project."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: Trinity College Dublin, University of Oulu, FutureNeuro SFI Research Centre, and St James's Hospital."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The funders (EU Horizon 2020, SFI, Business Finland) are public research funding agencies that do not have a financial interest in whether ChatGPT is useful for systematic reviews."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "A 'Conflicts of Interest' section is present and states 'None declared.'"
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. It uses ChatGPT as a code generation assistant in a qualitative case study."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No benchmark evaluation is conducted. The paper is a qualitative case study of code cocreation."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No benchmark is used. This is a qualitative case study."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants were involved. The paper explicitly states 'No ethics board review is needed as the work does not use patient data or involve human participants.'"
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants. The Ethical Considerations section explicitly states ethics review is not needed."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in the study. The single author who conducted the cocreation is described via Table 1 (skills background), but this is not a participant demographics report."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants were recruited."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants and no experimental conditions to randomize."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants and no experimental conditions requiring blinding."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The paper reports the total time (4 hours 39 minutes) but does not report any monetary cost, token usage, or number of API calls/messages exchanged with ChatGPT."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No computational budget is stated. The hardware used, API costs, and total compute resources are not reported."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Generative AI (ChatGPT 3.5) can be used to cocreate an automated script for systematic reviews of mHealth apps.",
    286       "evidence": "The Results section describes the full cocreation process, resulting in a working Python script that crawls the Google Play Store and cross-references with PubMed. The script is available on GitHub (reference [32]).",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "The cocreation process took 4 hours and 39 minutes total.",
    291       "evidence": "Stated in the Evaluation section and in the Highlights: 'The overall cocreation process exercise had a total duration of 4 hours and 39 minutes.'",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "No knowledge of Python scripting was required by the author.",
    296       "evidence": "Listed in Highlights section. However, Table 1 shows the author has beginner-to-intermediate skills in multiple programming-adjacent domains (JavaScript beginner, PHP beginner, HTML5 intermediate), undermining this claim. The process was conducted 'as if no coding skill was present,' but the author rated himself 4-7 on relevant coding skills.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "The output was useful as an intermediate outcome but insufficient to replace manual systematic review steps.",
    301       "evidence": "The Evaluation section states the CSV lacked metadata columns and the Medical Evidence column used only title matching in PubMed. The output 'was deemed unsuitable as a final output.'",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "This approach could be particularly useful for researchers with limited coding skills.",
    306       "evidence": "Stated in the abstract and conclusions, but supported only by a single case study involving a researcher who actually has some coding skills. No evidence from actual non-coders is presented.",
    307       "supported": "weak"
    308     }
    309   ],
    310   "methodology_tags": ["case-study", "qualitative"],
    311   "key_findings": "A physician used ChatGPT 3.5 to cocreate a Python script that automates part of a systematic review of mHealth apps, completing the process in about 4.5 hours. The resulting script could crawl the Google Play Store and cross-reference apps with PubMed, but the output was deemed insufficient as a final product, requiring further refinement. The paper provides a useful narrative account of the cocreation process including error handling and debugging, but the evaluation is minimal and the generalizability claims are not well-supported by a single-case study.",
    312   "red_flags": [
    313     {
    314       "flag": "N=1 case study with broad generalization claims",
    315       "detail": "The entire study is based on a single researcher's interaction with a single AI tool on a single topic. The conclusions generalize to 'researchers with limited coding skills' and systematic reviews broadly, far exceeding what a single case study can support."
    316     },
    317     {
    318       "flag": "Author's coding skills undermine 'no coding required' claim",
    319       "detail": "Table 1 shows the author rates himself 4-8 on various programming-related skills. The study was conducted 'as if no coding skill was present,' but the author's actual experience likely influenced prompting quality and debugging ability. This is not examined as a confound."
    320     },
    321     {
    322       "flag": "Minimal evaluation with no formal metrics",
    323       "detail": "The evaluation consists of a qualitative comparison to a prior script's output with no formal metrics, no precision/recall analysis of the app search results, and no comparison to alternative approaches."
    324     },
    325     {
    326       "flag": "No chat transcript or detailed interaction log released",
    327       "detail": "While some prompts are included in the paper, the full interaction log with ChatGPT is not available, making it impossible to fully verify or reproduce the cocreation process."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Better together? An evaluation of AI-supported code translation",
    333       "authors": ["J Weisz", "M Muller", "S Ross", "F Martinez", "S Houde", "M Agarwal"],
    334       "year": 2022,
    335       "doi": "10.1145/3490099.3511157",
    336       "relevance": "Evaluates AI-supported code translation, relevant to understanding AI-assisted programming productivity."
    337     },
    338     {
    339       "title": "ChatGPT is fun, but not an author",
    340       "authors": ["HH Thorp"],
    341       "year": 2023,
    342       "doi": "10.1126/science.adg7879",
    343       "relevance": "Discusses ethical implications of generative AI in scientific authorship, relevant to AI in research methodology."
    344     },
    345     {
    346       "title": "How does ChatGPT perform on the United States Medical Licensing Examination? The implications of large language models for medical education and knowledge assessment",
    347       "authors": ["A Gilson", "CW Safranek", "T Huang"],
    348       "year": 2023,
    349       "doi": "10.2196/45312",
    350       "relevance": "Evaluates ChatGPT's capabilities on medical exams, relevant to LLM capability assessment methodology."
    351     },
    352     {
    353       "title": "Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models",
    354       "authors": ["TH Kung", "M Cheatham", "A Medenilla"],
    355       "year": 2023,
    356       "doi": "10.1371/journal.pdig.0000198",
    357       "relevance": "Another evaluation of ChatGPT on medical licensing exams, relevant to LLM benchmark evaluation methodology."
    358     },
    359     {
    360       "title": "The role of ChatGPT, generative language models, and artificial intelligence in medical education: a conversation with ChatGPT",
    361       "authors": ["G Eysenbach"],
    362       "year": 2023,
    363       "doi": "10.2196/46885",
    364       "relevance": "Explores generative AI's role in medical education, relevant to understanding AI-assisted processes in research."
    365     },
    366     {
    367       "title": "A full systematic review was completed in 2 weeks using automation tools: a case study",
    368       "authors": ["J Clark", "P Glasziou", "C Del Mar", "A Bannach-Brown", "P Stehlik", "AM Scott"],
    369       "year": 2020,
    370       "doi": "10.1016/j.jclinepi.2020.01.008",
    371       "relevance": "Prior work on automating systematic reviews, directly relevant to AI-assisted research methodology."
    372     },
    373     {
    374       "title": "Reducing systematic review burden using Deduklick: a novel, automated, reliable, and explainable deduplication algorithm to foster medical research",
    375       "authors": ["N Borissov", "Q Haas", "B Minder"],
    376       "year": 2022,
    377       "doi": "10.1186/s13643-022-02045-9",
    378       "relevance": "AI-based automation of systematic review deduplication, relevant to AI-assisted research tools."
    379     }
    380   ]
    381 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs