scan.json (25655B)
1 { 2 "paper": { 3 "title": "Synergizing Human Expertise and AI Efficiency with Language Model for Microscopy Operation and Automated Experiment Design", 4 "authors": [ 5 "Yongtao Liu", 6 "Marti Checa", 7 "Rama K. Vasudevan" 8 ], 9 "year": 2024, 10 "venue": "Machine Learning: Science and Technology", 11 "arxiv_id": "2401.13803", 12 "doi": "10.1088/2632-2153/ad52e9" 13 }, 14 "scan_version": 3, 15 "active_modules": [], 16 "methodology_tags": ["case-study", "qualitative"], 17 "key_findings": "ChatGPT4, when fed an API guideline document (AEcroscoPy), can convert natural-language experimental descriptions into executable Python scripts for scanning probe microscopy operations, including multi-step workflows and custom scan trajectories. The LLM can also summarize experiment logs and assist in reproducing experiments from literature. However, the paper finds that GPT4's scientific interpretation capability is very limited, and it cannot design advanced experiments from domain literature without significant human guidance.", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "The paper references the AEcroscoPy package (ref 43) but does not provide a repository URL or archive for the code, prompts, or scripts used in this study. The ChatGPT4 conversation logs and generated scripts are shown only in figures and supplementary materials within the paper itself." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": false, 28 "justification": "No dataset is released. The microscopy data (BEPFM HDF files, PFM images) and ChatGPT4 conversation logs used in the study are not made publicly available." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "No environment specifications are provided. The paper does not list library versions, Python version, or any dependency information for reproducing the experiments." 
34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions are included. A reader would need to guess how to set up the ChatGPT4 sessions, what guideline document to feed, and how to replicate the specific queries." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper reports no quantitative metrics at all. All evaluations of LLM performance are qualitative descriptions (e.g., 'correctly suggested,' 'successfully accomplished'). No numerical results with uncertainty are provided." 46 }, 47 "significance_tests": { 48 "applies": false, 49 "answer": false, 50 "justification": "The paper makes no quantitative comparative claims between systems. All evaluations are qualitative demonstrations of a single system (ChatGPT4 + AEcroscoPy), so significance tests are structurally inapplicable." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": false, 55 "justification": "No effect sizes or quantitative measurements of performance are reported. The paper could have measured success rates, time savings, or code correctness rates but did not." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "The paper tests a small number of tasks (approximately 8-10 distinct queries across different categories) with no justification for why this number is sufficient to support its claims about LLM utility." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "No repeated trials are conducted. Each task is tested once. There is no assessment of whether the LLM would produce consistent outputs across multiple runs of the same query." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": false, 72 "justification": "No baselines are included. 
The paper does not compare ChatGPT4 against other LLMs, against non-LLM code generation approaches, or against human-only programming time. The evaluation consists solely of the authors' subjective assessment of ChatGPT4 outputs." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": false, 77 "justification": "No baselines of any kind are included, so contemporariness cannot be assessed." 78 }, 79 "ablation_study": { 80 "applies": false, 81 "answer": false, 82 "justification": "The system is a single component (ChatGPT4 prompted with API documentation). There are no modular components to ablate." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": false, 87 "justification": "No quantitative metrics are used at all. Evaluation is entirely qualitative — the authors inspect generated code and describe whether it is 'correct,' 'insightful,' or 'reasonable.'" 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": false, 92 "justification": "While the authors informally assess the LLM outputs, there is no formal human evaluation with defined criteria, inter-rater reliability, or structured assessment protocol." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": false, 97 "justification": "There is no test set concept. Tasks are ad-hoc demonstrations chosen by the authors." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "The paper organizes results by task category: individual command conversion, multifaceted workflow programming, experiment summarization/reproduction, data analysis, and scientific interpretation. Each category is discussed separately with distinct examples." 
103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper discusses several failure cases: the spiral scan with DC voltage where AE-GPT 'erroneously includes the tip voltage parameters directly into the spiral function'; the 'Necessity of Science-LLM for Scientific Interpretation' section noting that 'AE-GPT's ability in scientific interpretation is very limited'; and Supplemental Note-II showing designed experiments 'do not harness knowledge from the provided literatures.'" 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper reports that GPT4 'suffers from inability to extend beyond basic analyses or more in-depth technical experimental design,' that scientific interpretation is 'mostly just the definition of certain scientific concept,' and that advanced experiment design from literature fails. These are genuine negative findings." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims are hedged and qualitatively supported: 'LLM can be especially useful in converting ideations' is demonstrated through several examples, 'GPT4 is capable of analyzing microscopy images in a generic sense' is shown in Figure 6, and 'GPT4 suffers from inability to extend beyond basic analyses' is discussed in the Necessity of Science-LLM section. The claims are appropriately tentative." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": false, 124 "justification": "The paper claims the integration 'enables more efficient and reproducible experiments' and 'can potentially significantly increase the pace of scientific research.' These causal claims about efficiency and acceleration are not supported by any controlled comparison or quantitative evidence — only by qualitative demonstrations." 
125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper tests ChatGPT4 on one custom API (AEcroscoPy) and two commercial APIs for SPM, but the title and conclusions generalize to 'Microscopy Operation and Automated Experiment Design' broadly. The conclusions claim the synergy 'will open new door for accelerating scientific research' — far beyond the SPM domain tested." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "No alternative explanations are discussed. For example, the paper does not consider whether the tasks succeeded because GPT4's training data included similar API documentation and code examples, or whether the success is specific to well-documented APIs with Python interfaces." 135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper measures whether GPT4 generates seemingly correct code snippets and uses this as evidence that LLMs 'enable more efficient and reproducible experiments' and 'accelerate scientific research.' The gap between generating code that looks correct in a conversational setting and actually accelerating real research workflows is not acknowledged." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": false, 146 "justification": "The paper uses 'ChatGPT4' throughout without specifying an exact model version, snapshot date, or API version (e.g., gpt-4-0613). Model behavior varies across versions." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": true, 151 "justification": "The actual prompts (user queries) and ChatGPT4 responses are shown in Figures 2, 3, S1-S9, and supplemental notes. The user inputs are the actual text used, not just natural-language descriptions of what was asked." 
152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": false, 156 "justification": "No hyperparameters are reported — no temperature, top-p, max tokens, or other ChatGPT4 API settings are mentioned." 157 }, 158 "scaffolding_described": { 159 "applies": false, 160 "answer": false, 161 "justification": "No agentic scaffolding is used. The approach is straightforward in-context learning: feeding an API guideline document into ChatGPT4 and then issuing queries. There is no tool use, retry logic, memory management, or multi-step agent architecture." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": false, 166 "justification": "The paper states the AEcroscoPy guideline was 'fed' into ChatGPT4 but does not document how (via system prompt, file upload, or pasting), whether the full guideline or excerpts were used, or how the microscopy data (HDF files) were prepared for ChatGPT4 analysis." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": true, 173 "justification": "The 'Necessity of Science-LLM for Scientific Interpretation' section functions as a limitations discussion, detailing that AE-GPT's scientific interpretation is 'very limited' and that it lacks the ability to design advanced experiments from literature. The conclusions further note that 'experimental design capabilities of AE-GPT are generic, and the data analysis especially the interpretation of scientific data is basic.'" 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": false, 178 "justification": "No specific threats to validity are discussed. The paper does not consider issues such as whether outputs might look correct but contain subtle errors, whether results are reproducible across ChatGPT4 sessions, or whether the authors' domain expertise biased their assessment of code correctness." 
179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": false, 183 "justification": "No explicit scope boundaries are stated. The paper does not specify what it is NOT claiming (e.g., that this does not prove LLMs can replace expert microscopists, or that results are specific to ChatGPT4 and may not hold for other LLMs). The conclusions trend toward broad generalizations rather than bounded claims." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": false, 190 "justification": "No raw data is available. ChatGPT4 conversation logs, generated scripts, microscopy data files, and the AEcroscoPy guideline document used are not released for independent verification." 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "The paper describes the process: the AEcroscoPy guideline was fed to ChatGPT4 as a learning resource, then specific queries were issued for each task category (command conversion, workflow programming, experiment reproduction, data analysis). The process for each task is narrated in the text." 196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants are involved. The study evaluates an LLM on author-designed tasks, not a standard benchmark." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": false, 205 "justification": "The pipeline from feeding the guideline to ChatGPT4 through to evaluating outputs is not formally documented. It is unclear how the evaluation tasks were selected, whether all queries are reported (or only successful ones), and how code correctness was verified." 
206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Funding is disclosed in the Acknowledgements: 'This research was supported by the Center for Nanophase Materials Sciences (CNMS), which is a US Department of Energy, Office of Science User Facility at Oak Ridge National Laboratory.'" 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "All authors are affiliated with the Center for Nanophase Materials Sciences, Oak Ridge National Laboratory, which is clearly stated. The authors developed AEcroscoPy, the custom API being used, and this is disclosed via references." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": true, 222 "justification": "The funder (US DOE) has no financial interest in whether ChatGPT4 works well for microscopy automation. The funding supports the research facility broadly, not a specific outcome regarding LLM utility." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": true, 227 "justification": "The paper includes an explicit conflict of interest statement: 'The authors declare no conflict of interest.'" 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": false, 233 "answer": false, 234 "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. It tests ChatGPT4's ability to generate code for a specific API given its documentation, which is a code generation task rather than a benchmark evaluation." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": false, 238 "answer": false, 239 "justification": "No benchmark evaluation is performed. The tasks are novel, author-designed queries about a custom API (AEcroscoPy), not standardized benchmarks." 
240 }, 241 "benchmark_contamination_addressed": { 242 "applies": false, 243 "answer": false, 244 "justification": "No benchmark evaluation is performed. The study is a qualitative demonstration of LLM utility with custom tasks." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants are involved in this study." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants are involved in this study." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants are involved in this study." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants are involved in this study." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants are involved in this study." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants are involved in this study." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants are involved in this study." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": false, 288 "justification": "No inference costs are reported — no API costs, tokens consumed, or time per query are mentioned despite using a commercial LLM (ChatGPT4) for all experiments." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "No computational budget is stated. The total cost of ChatGPT4 usage for the experiments is not quantified." 
294 } 295 } 296 }, 297 "claims": [ 298 { 299 "claim": "LLMs (ChatGPT4) can convert human-language experimental workflow descriptions into executable Python code for microscope APIs.", 300 "evidence": "Demonstrated through multiple examples: individual command conversion (Figures 3, S1), BE line scan workflow (Figure 3a), custom trajectory design (Figure 3b-c), and multifaceted workflow programming (Figure 4, Figures S3-S5). The generated code uses correct API functions in proper sequence.", 301 "supported": "moderate" 302 }, 303 { 304 "claim": "AE-GPT can summarize experiment details from logger files and reproduce experiments from both logs and published literature.", 305 "evidence": "Demonstrated in Figure 5: AE-GPT translates a logger file to natural language (Figure 5a) and extracts BE-PFM parameters from a published paper to generate reproduction code (Figure 5b, Figures S7-S8).", 306 "supported": "moderate" 307 }, 308 { 309 "claim": "GPT4 can analyze BEPFM data including identifying parameters, plotting results, finding maximum intensity spectra, and calculating roughness.", 310 "evidence": "Shown in Figure 6 and Figure S9 where AE-GPT processes HDF data files and performs basic analyses upon request.", 311 "supported": "weak" 312 }, 313 { 314 "claim": "GPT4's scientific interpretation ability is very limited and it cannot design advanced experiments from literature.", 315 "evidence": "Discussed in 'Necessity of Science-LLM for Scientific Interpretation' section and Supplemental Notes I and II: 'The interpretation is mostly just the definition of certain scientific concept' and 'the designed experiments do not harness knowledge from the provided literatures.'", 316 "supported": "moderate" 317 }, 318 { 319 "claim": "The integration of LLMs and APIs 'can potentially significantly increase the pace of scientific research.'", 320 "evidence": "No quantitative evidence provided. 
Supported only by the qualitative demonstrations of code generation, which show ChatGPT4 can produce functional code snippets but do not measure actual research acceleration.", 321 "supported": "weak" 322 } 323 ], 324 "red_flags": [ 325 { 326 "flag": "No quantitative evaluation", 327 "detail": "The entire paper relies on qualitative assessment by the authors. No success rates, time comparisons, code correctness metrics, or any numerical evaluation is provided. The reader must trust the authors' subjective judgment that outputs are 'correct' or 'insightful.'" 328 }, 329 { 330 "flag": "Cherry-picked demonstrations", 331 "detail": "It is unclear whether all queries attempted are reported. The paper may show only successful examples while omitting failures. The selection of test tasks appears ad-hoc with no systematic sampling strategy." 332 }, 333 { 334 "flag": "Claims significantly outrun evidence", 335 "detail": "The paper's claims about 'accelerating scientific research,' 'enabling effective sharing of experimental protocols,' and serving as 'FAIR infrastructure' are far-reaching relative to the evidence, which consists of a handful of qualitative code generation demonstrations on one custom and two commercial APIs." 336 }, 337 { 338 "flag": "Self-evaluation bias", 339 "detail": "The authors developed AEcroscoPy and are evaluating ChatGPT4's ability to use their own API. They are uniquely positioned to judge code correctness but also uniquely biased — they may unconsciously rate outputs more favorably or design queries that align with what ChatGPT4 can do." 340 }, 341 { 342 "flag": "No reproducibility of LLM outputs", 343 "detail": "ChatGPT4 outputs are non-deterministic. No temperature settings are reported, no repeated trials are conducted, and no analysis of output consistency is performed. The specific model version is not stated, making future reproduction impossible." 
344 } 345 ], 346 "cited_papers": [ 347 { 348 "title": "GPT-4 Technical Report", 349 "authors": ["OpenAI"], 350 "year": 2023, 351 "arxiv_id": "2303.08774", 352 "relevance": "Technical report for GPT-4, the foundational LLM used throughout this study for code generation and scientific analysis tasks." 353 }, 354 { 355 "title": "Natural language processing models that automate programming will transform chemistry research and teaching", 356 "authors": ["G. M. Hocky", "A. D. White"], 357 "year": 2022, 358 "relevance": "Early argument for LLMs automating scientific programming, directly relevant to the survey's scope on LLM-assisted code generation for research." 359 }, 360 { 361 "title": "Opportunities for Retrieval and Tool Augmented Large Language Models in Scientific Facilities", 362 "authors": ["M. H. Prince"], 363 "year": 2023, 364 "arxiv_id": "2312.01291", 365 "relevance": "Directly relevant work on LLM tool augmentation for scientific instrumentation, closely related to the agentic AI workflow theme." 366 }, 367 { 368 "title": "14 examples of how LLMs can transform materials science and chemistry: a reflection on a large language model hackathon", 369 "authors": ["K. M. Jablonka"], 370 "year": 2023, 371 "relevance": "Explores diverse LLM applications in materials science including property prediction and code generation, relevant to LLM capability assessment." 372 }, 373 { 374 "title": "Summary of chatgpt-related research and perspective towards the future of large language models", 375 "authors": ["Y. Liu"], 376 "year": 2023, 377 "relevance": "Survey of ChatGPT research applications relevant to understanding the landscape of LLM capability claims." 378 }, 379 { 380 "title": "AEcroscoPy: A software-hardware framework empowering microscopy toward automated and autonomous experimentation", 381 "authors": ["Y.
Liu"], 382 "year": 2023, 383 "arxiv_id": "2312.10281", 384 "relevance": "The custom API framework used in this study, relevant as an example of programmatic scientific instrument control that LLMs interface with." 385 }, 386 { 387 "title": "Galactica: A large language model for science", 388 "authors": ["R. Taylor"], 389 "year": 2022, 390 "arxiv_id": "2211.09085", 391 "relevance": "Domain-specific scientific LLM relevant to the paper's argument that fine-tuned models would outperform general LLMs for scientific tasks." 392 }, 393 { 394 "title": "SciBERT: A pretrained language model for scientific text", 395 "authors": ["I. Beltagy", "K. Lo", "A. Cohan"], 396 "year": 2019, 397 "arxiv_id": "1903.10676", 398 "relevance": "Domain-specific scientific language model, cited as evidence that specialized LLMs could address limitations found in general-purpose models for scientific interpretation." 399 } 400 ], 401 "engagement_factors": { 402 "practical_relevance": { 403 "score": 2, 404 "justification": "Researchers operating scanning probe microscopes could adopt this approach of feeding API documentation to ChatGPT4 for workflow generation, though it requires specific hardware and API access." 405 }, 406 "surprise_contrarian": { 407 "score": 0, 408 "justification": "Confirms the widely expected finding that LLMs can generate code from documentation and help with API usage, with no surprising or contrarian elements." 409 }, 410 "fear_safety": { 411 "score": 0, 412 "justification": "No AI risk or safety concerns are raised; the paper is about assisting scientific instrumentation." 413 }, 414 "drama_conflict": { 415 "score": 0, 416 "justification": "No controversy or conflict; straightforward demonstration paper." 417 }, 418 "demo_ability": { 419 "score": 1, 420 "justification": "AEcroscoPy is published as a package but the LLM integration has no standalone demo; reproducing requires physical microscopy equipment." 
421 }, 422 "brand_recognition": { 423 "score": 1, 424 "justification": "Uses ChatGPT4 (well-known), but the lab (Oak Ridge National Laboratory), venue (Machine Learning: Science and Technology), and application domain (scanning probe microscopy) are niche." 425 } 426 } 427 }