scan.json (27267B)
1 { 2 "paper": { 3 "title": "Leveraging large language models for data analysis automation", 4 "authors": [ 5 "Jacqueline A Jansen", 6 "Artür Manukyan", 7 "Nour Al Khoury", 8 "Altuna Akalin" 9 ], 10 "year": 2023, 11 "venue": "bioRxiv", 12 "doi": "10.1101/2023.12.11.571140" 13 }, 14 "scan_version": 2, 15 "active_modules": ["experimental_rigor", "data_leakage"], 16 "methodology_tags": ["benchmark-eval", "case-study"], 17 "key_findings": "The mergen R package enables LLM-based code generation for data analysis via prompt engineering and self-correction. Task complexity reduces code executability across all strategies. CoT and ActAs prompt engineering did not improve executability over simple prompting, while self-correction combined with data file inclusion was the most effective strategy. GPT-4 outperformed GPT-3.5 but still struggled with complex multi-step tasks.", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "Code is released: mergen on CRAN and GitHub (https://github.com/BIMSBbioinfo/mergen), mergenstudio on GitHub (https://github.com/BIMSBbioinfo/mergenstudio), and manuscript reproduction code at https://github.com/BIMSBbioinfo/mergen-manuscript." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The paper states: 'The code and data to reproduce the analysis at the results section is available at https://github.com/BIMSBbioinfo/mergen-manuscript.' The evaluation tasks are also available there." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "The paper mentions R package dependencies (httr, openai, shiny) but provides no R version requirement, no environment specification file, and no detailed dependency versions beyond package names." 
34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "The paper points to the manuscript repository and the mergen website's 'Getting Started' section, but provides no step-by-step reproduction instructions in the paper itself. A researcher would need to reverse-engineer the experimental setup from the code." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "Despite running each prompt 10 times, no confidence intervals or error bars are reported. Figures show aggregate fractions of executable tasks without uncertainty quantification." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "The paper claims GPT-4 shows 'notable improvement' over GPT-3.5 and that self-correction is 'the most effective' strategy, but no statistical significance tests are used to support these comparative claims." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": false, 55 "justification": "Results are reported as fractions of executable tasks per complexity level, but no effect sizes (e.g., improvement magnitude with context) are provided for comparisons between strategies or models." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "The evaluation uses N=20 prompts with n=10 cycles. No justification is given for why 20 tasks or 10 repetitions were chosen, and no power analysis is discussed." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "Each prompt was run 10 times to 'account for variability,' but no variance, standard deviation, or spread measure is reported across these runs. Only aggregate executability fractions are shown." 
66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "The 'simple' prompting strategy serves as the baseline, against which CoT, ActAs, fileCont, and selfCorrect strategies are compared. GPT-3.5 serves as baseline for the GPT-4 comparison." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "GPT-3.5 and GPT-4 were state-of-the-art models at the time of writing (2023). The prompt engineering techniques (CoT, Act As) are contemporary methods." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "The study systematically tests different components: simple → +CoT, +ActAs, +fileCont, +selfCorrect, each adding a feature to measure its contribution to code executability." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": false, 87 "justification": "Code executability is the only metric used. The paper explicitly acknowledges: 'code executability was evaluated as the sole metric for LLM accuracy, overall task adequateness was not assessed.'" 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": false, 92 "justification": "Evaluation is entirely automated (code execution success/failure). The paper acknowledges that 'overall task adequateness was not assessed,' which would have required human evaluation of whether the code produced correct analytical results." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": false, 97 "justification": "All 20 tasks are used for both evaluation and drawing conclusions. There is no separation into development and test sets, and claims about which strategy is 'best' are made on the same data used for all comparisons." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Results are broken down by task complexity level (1-5) in Figures 3-7, showing executability rates per complexity category." 
103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper discusses failure patterns: complex tasks fail more, file content inclusion sometimes increases errors for complexity 4+, and GPT-4 still fails on 'tasks that required sophisticated data integration, or multi-step data analysis.'" 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper reports that CoT and ActAs prompt engineering 'did not result in a decreased error rate' (Figure 4), and that file content inclusion 'in some cases seemed to increase the error rate' for complex tasks." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims LLMs 'effectively generate code for some data analysis tasks' but 'challenges remain in executable code generation, especially for complex data analysis tasks.' The results in Figures 3-7 support both claims." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "Claims like 'self-correction improved performance' and 'GPT-4 demonstrates improvement over GPT-3.5' are supported by controlled single-variable comparisons: each prompt strategy is tested independently holding other variables constant, and the model comparison uses the same strategy (selfCorrect)." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The title is 'Leveraging large language models for data analysis automation' but tests only GPT-3.5/GPT-4 on 20 bioinformatics tasks in R. Claims about 'LLM capabilities and limitations' are not bounded to the tested models, domain, or programming language." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "No alternative explanations are discussed for the results. 
Why CoT/ActAs didn't improve executability, why file content sometimes hurt performance for complex tasks, and what other factors might explain model differences are not explored." 135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper measures code executability but frames the work as 'data analysis automation.' They briefly note 'overall task adequateness was not assessed' but continue to frame executable code as sufficient for data analysis without substantively discussing what data analysis quality actually requires beyond execution." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": false, 146 "justification": "The paper lists available models (gpt-4, gpt-4-0314, gpt-3.5-turbo, etc.) but for experiments reports only 'GPT-3.5-Turbo and GPT-4' without specifying which exact version/snapshot was used. Model behavior changes across versions." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": true, 151 "justification": "System prompts are provided verbatim (e.g., 'Act as an expert bioinformatician and R user...'). Example task prompts are shown in the paper. Full task set is available at the manuscript repository." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": false, 156 "justification": "No API hyperparameters (temperature, top-p, max_tokens) are reported despite making multiple LLM API calls. These significantly affect output quality and reproducibility." 157 }, 158 "scaffolding_described": { 159 "applies": true, 160 "answer": true, 161 "justification": "The self-correction mechanism is described: error capture → new prompt with error message → LLM generates corrected code → iterate up to N attempts. The full workflow (prompt augmentation → code generation → extraction → dependency installation → execution) is documented." 
162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": true, 166 "justification": "The task complexity classification scheme (5 components: file reading, data wrangling, visualization, ML/statistics, multiple datasets) is described. Code extraction and cleaning steps (clean_code_blocks, extractCode with delimiters) are documented." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": false, 173 "justification": "No dedicated limitations section exists. Some limitations are mentioned inline in the Discussion (e.g., executability as sole metric, GPT-4 still limited on complex tasks) but there is no substantive, structured discussion." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": false, 178 "justification": "No threats-to-validity section. The Discussion mentions some limitations but does not identify specific threats to validity for this study (e.g., API non-determinism, task selection bias, single-metric evaluation)." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": false, 183 "justification": "No explicit statements about what the results do NOT show. The paper does not bound its claims to specific domains, languages, or model families, despite testing only bioinformatics tasks in R with OpenAI models." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": true, 190 "justification": "The manuscript repository (https://github.com/BIMSBbioinfo/mergen-manuscript) contains the code and data to reproduce the analysis, making raw experimental data available for verification." 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "The paper describes the experimental setup: 20 tasks of varying complexity, each run 10 times per condition, evaluated on code executability. 
Task types (bioinformatics data analysis) and complexity classification are described." 196 }, 197 "recruitment_methods_described": { 198 "applies": true, 199 "answer": false, 200 "justification": "The tasks are described as 'a curated dataset that represents a spectrum of tasks common in bioinformatics' but no selection or curation criteria are described. How the 20 tasks were chosen and whether they are representative is unexplained." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": true, 205 "justification": "The pipeline from prompt creation through LLM interaction, code extraction, dependency installation, execution, and executability evaluation is documented with function-level detail (sendPrompt, extractCode, executeCode, selfcorrect)." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": false, 212 "justification": "No funding acknowledgments section and no mention of funding sources anywhere in the paper." 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Author affiliations are clearly listed: Max Delbrück Center for Molecular Medicine (MDC), University of Potsdam, Free University of Berlin. Authors are not affiliated with the evaluated product (OpenAI)." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": false, 222 "justification": "No funding information is disclosed, so independence of the funder cannot be assessed. Absence of disclosure is not evidence of absence of conflict." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests statement is present in the paper. Absence of disclosure is not the same as absence of conflict." 
228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": true, 233 "answer": false, 234 "justification": "The paper evaluates GPT-3.5 and GPT-4 on custom bioinformatics tasks but does not state the models' training data cutoff dates." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": true, 238 "answer": false, 239 "justification": "No discussion of whether the bioinformatics tasks or similar data analysis patterns could have appeared in GPT's training data. Common R analysis patterns are widely available online." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": true, 243 "answer": false, 244 "justification": "Despite using tasks that likely involve common data analysis patterns available in GPT training data, no contamination analysis or discussion is provided." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants. The study evaluates LLM-generated code on computational tasks." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants. The study is a computational evaluation of LLM code generation." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants." 
282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": false, 288 "justification": "The study makes hundreds of API calls (20 prompts × 10 cycles × multiple strategies × 2 models) but reports no costs, token counts, or latency measurements." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "No total computational budget (API spend, total tokens consumed, wall-clock time for experiments) is reported despite extensive API usage." 294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": false, 300 "justification": "Each prompt is run 10 times to 'account for variability' but results across runs are aggregated into fractions. No per-run variation or seed sensitivity analysis is reported." 301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": true, 305 "justification": "Clearly stated: 'we ran the same prompt 10 times' and figures annotated with 'N=20 individual prompts over n=10 cycles.'" 306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": false, 310 "justification": "No hyperparameter search is described. API parameters (temperature, top-p) are not even reported, let alone a search budget for them." 311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": false, 315 "justification": "Self-correction is declared 'the most effective' strategy based on the same data used for all comparisons. No validation set is used for strategy selection." 316 }, 317 "multiple_comparison_correction": { 318 "applies": true, 319 "answer": false, 320 "justification": "Multiple strategies are compared (simple, CoT, ActAs, fileCont, selfCorrect) across complexity levels without any multiple comparison correction. No statistical tests are performed at all." 
321 }, 322 "self_comparison_bias_addressed": { 323 "applies": true, 324 "answer": false, 325 "justification": "The authors evaluate their own package (mergen) and its self-correction mechanism without acknowledging the bias of evaluating their own system or having independent evaluation." 326 }, 327 "compute_budget_vs_performance": { 328 "applies": true, 329 "answer": false, 330 "justification": "Self-correction uses multiple API calls per task (up to 3 correction attempts) compared to simple prompting (1 call), but this compute cost difference is not discussed or compared against the performance gain." 331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": false, 335 "justification": "Code executability is used as the sole measure of LLM effectiveness for data analysis. The paper does not discuss whether 'code runs without errors' actually measures 'code correctly performs the intended analysis.'" 336 }, 337 "scaffold_confound_addressed": { 338 "applies": true, 339 "answer": true, 340 "justification": "For the GPT-3.5 vs GPT-4 model comparison, the same scaffold (selfCorrect strategy) is used for both models, controlling for the scaffolding confound. Prompt strategy comparisons intentionally vary the scaffold as the experimental variable." 341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of temporal leakage. Common R/bioinformatics analysis patterns are widely available online and likely present in GPT training data, but this is not addressed." 348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "The fileCont strategy adds data file content to prompts, providing extra information. Whether this constitutes feature leakage relative to realistic usage is not discussed." 
353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of whether the custom bioinformatics tasks share structural similarities with examples in GPT's training data." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": false, 362 "justification": "No leakage detection or prevention method is applied." 363 } 364 } 365 }, 366 "claims": [ 367 { 368 "claim": "Task complexity reduces code executability: as tasks escalate from basic data reading to more advanced analytical procedures, the fraction of executable code decreases.", 369 "evidence": "Figures 3A and 3B show a clear downward trend in executability as complexity increases from 1 to 5 across all prompt strategies (N=20 prompts, n=10 cycles).", 370 "supported": "strong" 371 }, 372 { 373 "claim": "CoT and ActAs prompt engineering techniques do not improve code executability over simple prompting.", 374 "evidence": "Figure 4 shows no improvement in executability fractions for CoT or ActAs compared to the simple strategy across all complexity levels.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "Including data file content in prompts improves executability for low-to-moderate-complexity tasks (levels 1-3) but not for high-complexity tasks (levels 4-5).", 379 "evidence": "Figure 5 shows improved executability with fileCont for complexity 1-3 but increased error rates for complexity 4+.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "The self-correction mechanism is the most effective strategy for improving code executability.", 384 "evidence": "Figure 6 shows selfCorrect outperforming all other strategies across complexity levels, though no statistical tests confirm significance.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "GPT-4 demonstrates notable improvement over GPT-3.5 in generating executable code for data analysis tasks.", 389 "evidence": "Figure 7 shows GPT-4 with higher executability fractions than GPT-3.5 across
complexity levels using the selfCorrect strategy, but no statistical tests are reported.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "LLMs cannot yet consistently replace domain experts for generation of executable code in complex data analysis tasks.", 394 "evidence": "Even GPT-4 with self-correction shows declining executability for complex tasks (complexity 4-5) as shown in Figure 7B.", 395 "supported": "strong" 396 } 397 ], 398 "red_flags": [ 399 { 400 "flag": "Single metric evaluation", 401 "detail": "Code executability is the only evaluation metric. Whether the generated code produces correct analytical results (task adequateness) is never assessed. Executable code that produces wrong results would be counted as a success." 402 }, 403 { 404 "flag": "No statistical tests or uncertainty quantification", 405 "detail": "Despite 10 repetitions per task, no variance, error bars, confidence intervals, or significance tests are reported. Claims about which strategy is 'best' and model superiority rest on visual inspection of bar charts." 406 }, 407 { 408 "flag": "Authors evaluate own system without acknowledging bias", 409 "detail": "The authors developed mergen and its self-correction mechanism, then evaluated it as the most effective approach. No independent evaluation or acknowledgment of author-evaluation bias." 410 }, 411 { 412 "flag": "No hyperparameter reporting", 413 "detail": "Temperature, top-p, and other API parameters are not reported. These significantly affect LLM output quality and make the experiments non-reproducible." 414 }, 415 { 416 "flag": "Very small task set", 417 "detail": "Only 20 tasks are used for evaluation. With 5 complexity levels, some levels may have very few tasks. No justification for sample size is provided." 
418 }, 419 { 420 "flag": "No cost reporting despite extensive API usage", 421 "detail": "Hundreds of API calls across strategies and models, but no cost or token usage is reported, limiting practical applicability assessment." 422 } 423 ], 424 "cited_papers": [ 425 { 426 "title": "Evaluating Large Language Models Trained on Code", 427 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 428 "year": 2021, 429 "arxiv_id": "2107.03374", 430 "relevance": "Foundational work on LLM code generation evaluation (Codex/HumanEval), directly relevant to LLM coding capabilities." 431 }, 432 { 433 "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models", 434 "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"], 435 "year": 2023, 436 "arxiv_id": "2307.09288", 437 "relevance": "Major open-source LLM family relevant to understanding model capabilities for code generation." 438 }, 439 { 440 "title": "GitHub Copilot AI pair programmer: Asset or Liability?", 441 "authors": ["Arghavan Moradi Dakhel", "Vahid Majdinasab", "Amin Nikanjam"], 442 "year": 2022, 443 "arxiv_id": "2206.15331", 444 "relevance": "Empirical evaluation of GitHub Copilot's code generation quality, directly relevant to LLM-assisted programming." 445 }, 446 { 447 "title": "A Prompt Pattern Catalog to Enhance Prompt Engineering with ChatGPT", 448 "authors": ["Jules White", "Quchen Fu", "Sam Hays"], 449 "year": 2023, 450 "arxiv_id": "2302.11382", 451 "relevance": "Systematic catalog of prompt engineering techniques relevant to understanding prompting strategies for code generation." 452 }, 453 { 454 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 455 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 456 "year": 2022, 457 "arxiv_id": "2201.11903", 458 "relevance": "Foundational CoT prompting paper, directly tested in this study's evaluation of prompt engineering for code generation." 
459 }, 460 { 461 "title": "LLMs for Science: Usage for Code Generation and Data Analysis", 462 "authors": ["Mohamed Nejjar", "Luca Zacharias", "Fabian Stiehle"], 463 "year": 2023, 464 "arxiv_id": "2311.16733", 465 "relevance": "Directly related work benchmarking LLMs for scientific code generation and data analysis tasks." 466 }, 467 { 468 "title": "Bioinfo-Bench: A Simple Benchmark Framework for LLM Bioinformatics Skills Evaluation", 469 "authors": ["Qingyu Chen", "Chuanqi Deng"], 470 "year": 2023, 471 "doi": "10.1101/2023.10.18.563023", 472 "relevance": "Benchmark for evaluating LLM bioinformatics capabilities, directly relevant to domain-specific LLM code generation evaluation." 473 }, 474 { 475 "title": "BioMANIA: Simplifying bioinformatics data analysis through conversation", 476 "authors": ["Zhengyuan Dong", "Victor Zhong", "Yang Young Lu"], 477 "year": 2023, 478 "doi": "10.1101/2023.10.29.564479", 479 "relevance": "Conversational LLM interface for bioinformatics analysis, directly comparable to mergen's approach." 480 }, 481 { 482 "title": "Automated Bioinformatics Analysis via AutoBA", 483 "authors": ["Juexiao Zhou", "Bin Zhang", "Xiuying Chen"], 484 "year": 2023, 485 "doi": "10.1101/2023.09.08.556814", 486 "relevance": "Automated bioinformatics pipeline using LLMs, directly comparable system for LLM-based scientific data analysis." 487 } 488 ] 489 }