scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (23626B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Leveraging large language models for data analysis automation",
      6     "authors": [
      7       "Jacqueline A Jansen",
      8       "Artür Manukyan",
      9       "Nour Al Khoury",
     10       "Altuna Akalin"
     11     ],
     12     "year": 2023,
     13     "venue": "bioRxiv",
     14     "arxiv_id": null,
     15     "doi": "10.1101/2023.12.11.571140"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All abstract claims (expert shortage, mergen package capabilities, evaluation of LLM code generation) are demonstrated in the paper with supporting evidence.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Comparative claims about self-correction improving performance (Figure 6) and GPT-4 outperforming GPT-3.5 (Figure 7) are supported by systematic A/B evaluation on the same task set, repeated 10 times each.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Findings are explicitly bounded to bioinformatics data analysis tasks in R. Discussion concludes LLMs 'cannot consistently replace domain experts' for complex tasks, not claiming universal solutions.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Authors note that code executability (sole metric) may not capture task adequateness, and acknowledge that prompt engineering's lack of improvement might be due to this limitation rather than true ineffectiveness.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Paper clearly distinguishes between measuring 'code executability' (runs without error) versus 'task adequateness' (produces correct results), explicitly stating the latter was not assessed.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No dedicated limitations or threats-to-validity section exists. Caveats are scattered in results and discussion but not systematically presented.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Threats are mentioned informally (task adequateness not assessed, GPT-4 still fails on complex tasks) but are not organized into a specific, bounded threats-to-validity discussion.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Scope is explicit: bioinformatics tasks, R-based, 5-level complexity framework, evaluation on 20 prompts × 10 cycles. Title is broad but content is appropriately scoped.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding sources disclosed anywhere in the manuscript.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All author affiliations with Max Delbrück Center, University of Potsdam, and Free University of Berlin are listed.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "Cannot determine independence without knowing funding source. Notably, paper evaluates OpenAI models but does not disclose whether OpenAI funded this work.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial disclosures statement present.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are well defined: 'executability' = code runs without errors; 'task complexity' = 5-level framework based on component count; prompt engineering techniques (Act As, CoT) explained with examples.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Contribution is explicit: (1) mergen R package for LLM-based code generation, (2) understanding of LLM capabilities/limitations, (3) practical recipes for integration into workflows.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Introduction cites prior LLM code generation work (Chen et al., Touvron et al.) and prompt engineering techniques (Wei et al., White et al.). Discussion compares to other LLM-based data analysis solutions (Nejjar et al., Dai et al., Chen & Deng).",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "mergen package freely available on CRAN and GitHub (https://github.com/BIMSBbioinfo/mergen); mergenstudio also on GitHub.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Test tasks and reproduction code available at https://github.com/BIMSBbioinfo/mergen-manuscript.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": true,
    135           "justification": "Package is in R; dependencies (httr, openai packages) are standard CRAN packages. CRAN submission implies dependency specification, though specific versions/Dockerfile not provided.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": true,
    141           "justification": "Methods section provides step-by-step code examples for setupAgent(), sendPrompt(), extractCode(), executeCode(), and selfcorrect(). GitHub repo with reproducible analysis linked.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "No confidence intervals, error bars, significance tests, effect sizes, sample size justification, or variance reporting. Results are aggregated across 10 runs but the spread is not shown. The answer for this category as a whole is therefore false.",
    149         "source": "haiku"
    150       },
    151       "confidence_intervals_or_error_bars": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Figures 3-7 show only point estimates (fraction executable or yes/no) without error bars or confidence intervals.",
    155         "source": "haiku"
    156       },
    157       "significance_tests": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "No formal statistical tests comparing strategies (simple vs CoT vs selfCorrect, or GPT-3.5 vs GPT-4).",
    161         "source": "haiku"
    162       },
    163       "effect_sizes_reported": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "No effect sizes; results shown as fractions (e.g., '80% executable') but no standardized effect size metrics.",
    167         "source": "haiku"
    168       },
    169       "sample_size_justified": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "Each prompt run 10 times (n=10 cycles) with N=20 prompts, but no justification given for these choices. No power analysis.",
    173         "source": "haiku"
    174       },
    175       "variance_reported": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "Variance/std dev across 10 runs not reported. Aggregated results only.",
    179         "source": "haiku"
    180       },
    181       "evaluation_design": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Systematic A/B comparison of prompt strategies and LLM models, with task breakdowns by complexity, failure case analysis, and negative results reported (CoT/ActAs didn't help as expected).",
    185         "source": "haiku"
    186       },
    187       "baselines_included": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Baselines: simple prompt, CoT, ActAs, fileCont, selfCorrect; also GPT-3.5 vs GPT-4 comparison.",
    191         "source": "haiku"
    192       },
    193       "baselines_contemporary": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "GPT-3.5-turbo and GPT-4 were state-of-the-art models in Dec 2023 when paper was posted.",
    197         "source": "haiku"
    198       },
    199       "ablation_study": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Prompt strategies tested incrementally: simple, then +CoT, +ActAs, +fileCont, +selfCorrect. Shows contribution of self-correction mechanism across all complexity levels.",
    203         "source": "haiku"
    204       },
    205       "multiple_metrics": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Code executability (primary), response length (proxy for complexity), and error rate examined across multiple prompt strategies and model comparisons.",
    209         "source": "haiku"
    210       },
    211       "human_evaluation": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "Evaluation is automatic (code runs or not). Paper acknowledges 'overall task adequateness was not assessed'—no human judgment of whether code solves the task correctly.",
    215         "source": "haiku"
    216       },
    217       "held_out_test_set": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "Not a prediction task. Same tasks run 10 times but no train/test split.",
    221         "source": "haiku"
    222       },
    223       "per_category_breakdown": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Results broken down by task complexity level (1–5), prompt strategy, and model. Per-complexity analysis drives main findings.",
    227         "source": "haiku"
    228       },
    229       "failure_cases_discussed": {
    230         "applies": true,
    231         "answer": true,
    232         "justification": "Explicitly discussed: tasks with complexity ≥4 have low success rates; GPT-4 fails on multi-step data integration tasks; file content inclusion backfires on complex tasks.",
    233         "source": "haiku"
    234       },
    235       "negative_results_reported": {
    236         "applies": true,
    237         "answer": true,
    238         "justification": "Key negative result: 'Contrary to our initial hypothesis, our results indicate that more complex prompt engineering techniques do not necessarily lead to a marked improvement.' CoT and ActAs didn't reduce error rate.",
    239         "source": "haiku"
    240       },
    241       "setup_transparency": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "Model versions listed (gpt-4, gpt-4-0314, gpt-3.5-turbo, etc.); example prompts provided in Methods; mergen scaffolding (error feedback, self-correction) described. Missing: LLM hyperparameters (temperature, top-p).",
    245         "source": "haiku"
    246       },
    247       "model_versions_specified": {
    248         "applies": true,
    249         "answer": true,
    250         "justification": "Both OpenAI chat/completion models and specific versions (gpt-4-0314, gpt-3.5-turbo-0301) listed. Final experiments use 'GPT-3.5' and 'GPT-4' generically but reasonable for 2023.",
    251         "source": "haiku"
    252       },
    253       "prompts_provided": {
    254         "applies": true,
    255         "answer": true,
    256         "justification": "Example prompts shown ('How do I perform PCA on data in my file called test.txt?'); context instructions fully specified ('Act as an expert bioinformatician...').",
    257         "source": "haiku"
    258       },
    259       "hyperparameters_reported": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "Temperature, top-p, max_tokens, and other LLM hyperparameters not reported.",
    263         "source": "haiku"
    264       },
    265       "scaffolding_described": {
    266         "applies": true,
    267         "answer": true,
    268         "justification": "mergen scaffolding fully described: prompt engineering (Act As, CoT, context), error feedback loops, automatic dependency resolution, self-correction mechanism.",
    269         "source": "haiku"
    270       },
    271       "data_preprocessing_documented": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Data pipeline documented: file reading, data wrangling, visualization, ML/stats application. Specific preprocessing steps for test tasks not detailed but tasks themselves available on GitHub.",
    275         "source": "haiku"
    276       },
    277       "data_integrity": {
    278         "applies": true,
    279         "answer": true,
    280         "justification": "Test data and code available at GitHub; data collection described as curated bioinformatics task set. Task selection process not fully documented but reproducibility link provided.",
    281         "source": "haiku"
    282       },
    283       "raw_data_available": {
    284         "applies": true,
    285         "answer": true,
    286         "justification": "GitHub repo (mergen-manuscript) contains tasks and data to reproduce analysis.",
    287         "source": "haiku"
    288       },
    289       "data_collection_described": {
    290         "applies": true,
    291         "answer": true,
    292         "justification": "Tasks described as 'typical bioinformatics data analysis tasks' with 5-component complexity framework. Specific selection criteria for 20 prompts not stated but all available for inspection.",
    293         "source": "haiku"
    294       },
    295       "recruitment_methods_described": {
    296         "applies": false,
    297         "answer": false,
    298         "justification": "N/A—no human subjects; tasks are synthetic.",
    299         "source": "haiku"
    300       },
    301       "data_pipeline_documented": {
    302         "applies": true,
    303         "answer": true,
    304         "justification": "Full pipeline documented: task description → prompt → LLM response → code extraction → execution → result evaluation.",
    305         "source": "haiku"
    306       },
    307       "contamination": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Training cutoff dates for GPT-3.5/GPT-4 not mentioned; no discussion of whether models saw bioinformatics tasks during training. Custom tasks reduce risk but lack of discussion is a gap.",
    311         "source": "haiku"
    312       },
    313       "training_cutoff_stated": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "OpenAI model training cutoff dates not stated.",
    317         "source": "haiku"
    318       },
    319       "train_test_overlap_discussed": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "No discussion of whether bioinformatics tasks overlap with LLM training data.",
    323         "source": "haiku"
    324       },
    325       "benchmark_contamination_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "Not addressed; tasks are custom so risk is lower than standard benchmarks, but no explicit discussion.",
    329         "source": "haiku"
    330       },
    331       "cost_and_practicality": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "Neither inference cost (API calls × model pricing) nor compute budget reported. Self-correction cycles increase cost, not quantified.",
    335         "source": "haiku"
    336       },
    337       "inference_cost_reported": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "No mention of OpenAI API cost per call or total evaluation budget.",
    341         "source": "haiku"
    342       },
    343       "compute_budget_stated": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "Total computational cost not stated (no. of API calls, cost, latency).",
    347         "source": "haiku"
    348       }
    349     }
    350   },
    351   "claims": [
    352     {
    353       "claim": "Code executability decreases sharply with task complexity",
    354       "evidence": "Figures 3B, 6B, 7B show fraction of executable tasks declining from ~90% (complexity 1–2) to ~20% (complexity 4–5)",
    355       "supported": "strong"
    356     },
    357     {
    358       "claim": "Self-correction mechanism significantly improves code executability across all complexity levels",
    359       "evidence": "Figure 6 shows selfCorrect strategy outperforms simple, CoT, ActAs, fileCont across complexity levels",
    360       "supported": "strong"
    361     },
    362     {
    363       "claim": "Prompt engineering techniques (CoT, ActAs) do not improve code executability",
    364       "evidence": "Figure 4: CoT and ActAs show similar error rates to simple prompting, contrary to initial hypothesis",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "Including file content in prompts improves performance for moderate complexity tasks but degrades for complex tasks",
    369       "evidence": "Figure 5: fileCont shows mixed results; benefits at complexity 2–3, worse at complexity ≥4",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "GPT-4 significantly outperforms GPT-3.5 in generating executable code",
    374       "evidence": "Figure 7 shows GPT-4 executable fraction consistently higher than GPT-3.5-turbo across all complexity levels",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "LLMs cannot reliably generate executable code for complex bioinformatics data analysis tasks",
    379       "evidence": "Discussion: 'GPT-4's performance did not consistently result in executable code for many of the more complex tasks'; success rate <50% at complexity 4–5",
    380       "supported": "moderate"
    381     }
    382   ],
    383   "methodology_tags": [
    384     "empirical",
    385     "benchmark-eval"
    386   ],
    387   "key_findings": "The mergen R package successfully integrates LLMs for data analysis code generation, with self-correction mechanisms proving most effective at improving executability. Code executability correlates inversely with task complexity, dropping from ~90% on simple tasks to ~20% on complex tasks. Contrary to expectations, advanced prompt engineering (Chain of Thought, Act As) offers no improvement over simple prompting, whereas self-correction loops do, and including file content helps on moderate-complexity tasks but hurts on complex ones. GPT-4 substantially outperforms GPT-3.5, yet still fails on multi-step analytical workflows, indicating LLMs require significant scaffolding before replacing domain experts.",
    388   "red_flags": [
    389     {
    390       "flag": "No statistical significance testing",
    391       "detail": "Comparisons between strategies (simple vs selfCorrect, GPT-3.5 vs GPT-4) lack formal statistical tests; visual differences could be noise given no reported confidence intervals or p-values"
    392     },
    393     {
    394       "flag": "No human evaluation of correctness",
    395       "detail": "Only code executability measured (runs without error). Paper acknowledges task adequateness not assessed—does generated code actually solve the problem correctly?"
    396     },
    397     {
    398       "flag": "No error bars or confidence intervals",
    399       "detail": "Results aggregated across 10 runs per prompt but spread not shown. Reproducibility of results unclear"
    400     },
    401     {
    402       "flag": "Sample size not justified",
    403       "detail": "10 cycles × 20 prompts were chosen without any stated justification or power analysis"
    404     },
    405     {
    406       "flag": "Missing LLM hyperparameters",
    407       "detail": "Temperature, top-p, max_tokens not reported; reproducibility of LLM calls compromised"
    408     },
    409     {
    410       "flag": "Training data contamination not discussed",
    411       "detail": "Could GPT-3.5/4 have seen bioinformatics tasks during training? Custom tasks lower risk but lack of discussion is a gap"
    412     },
    413     {
    414       "flag": "No funding or competing interests disclosure",
    415       "detail": "Paper evaluates OpenAI models extensively but does not disclose whether OpenAI funded work or if authors have financial interest"
    416     },
    417     {
    418       "flag": "Cost and practicality not reported",
    419       "detail": "API costs for evaluation not stated; self-correction cycles will multiply cost in practice but magnitude not quantified"
    420     },
    421     {
    422       "flag": "Overly broad title vs narrow scope",
    423       "detail": "'Leveraging LLMs for data analysis automation' suggests general applicability, but evaluation limited to bioinformatics tasks in R"
    424     },
    425     {
    426       "flag": "No dedicated limitations section",
    427       "detail": "Caveats scattered throughout; systematic threats-to-validity discussion absent"
    428     }
    429   ],
    430   "cited_papers": [
    431     {
    432       "title": "Evaluating Large Language Models Trained on Code",
    433       "authors": "Chen et al.",
    434       "year": 2021,
    435       "relevance": "Foundational work on LLM code generation capabilities; cited for evidence that LLMs effectively generate code snippets"
    436     },
    437     {
    438       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    439       "authors": "Wei et al.",
    440       "year": 2022,
    441       "relevance": "Seminal prompt engineering technique tested in this paper; key baseline for prompt strategy comparison"
    442     },
    443     {
    444       "title": "A Prompt Pattern Catalog to Enhance Prompt Engineering with ChatGPT",
    445       "authors": "White et al.",
    446       "year": 2023,
    447       "relevance": "Comprehensive prompt patterns; mergen's 'Act As' approach draws from this work"
    448     },
    449     {
    450       "title": "LLMs for Science: Usage for Code Generation and Data Analysis",
    451       "authors": "Nejjar et al.",
    452       "year": 2023,
    453       "relevance": "Directly comparable work on LLM-based data analysis code generation; positions mergen within the landscape"
    454     },
    455     {
    456       "title": "Bioinfo-Bench: A Simple Benchmark Framework for LLM Bioinformatics Skills Evaluation",
    457       "authors": "Chen & Deng",
    458       "year": 2023,
    459       "relevance": "Bioinformatics-specific LLM benchmark; related evaluation framework in same domain"
    460     },
    461     {
    462       "title": "LLM-in-the-loop: Leveraging Large Language Model for Thematic Analysis",
    463       "authors": "Dai et al.",
    464       "year": 2023,
    465       "relevance": "Interactive LLM integration approach; comparable to mergen's human-in-the-loop design"
    466     },
    467     {
    468       "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models",
    469       "authors": "Touvron et al.",
    470       "year": 2023,
    471       "relevance": "Open-source LLM alternative to OpenAI models; mentioned as potential backbone for mergen"
    472     }
    473   ],
    474   "engagement_factors": {
    475     "practical_relevance": {
    476       "score": 2,
    477       "justification": "mergen is released on CRAN and directly usable, but limited to simple/moderate tasks due to high failure rate on complex analyses"
    478     },
    479     "surprise_contrarian": {
    480       "score": 1,
    481       "justification": "Negative result on prompt engineering (CoT, ActAs) is somewhat surprising but not deeply contrarian; the finding aligns with known limitations of LLM reasoning"
    482     },
    483     "fear_safety": {
    484       "score": 0,
    485       "justification": "No discussion of AI risks, misuse, or safety considerations"
    486     },
    487     "drama_conflict": {
    488       "score": 0,
    489       "justification": "Straightforward technical tool paper; no controversy or conflict angle"
    490     },
    491     "demo_ability": {
    492       "score": 3,
    493       "justification": "Package fully functional and available on CRAN; anyone with R and an OpenAI API key can immediately try mergen"
    494     },
    495     "brand_recognition": {
    496       "score": 1,
    497       "justification": "Max Delbrück Center for Molecular Medicine is a reputable German research institute but not a top-tier AI lab; limited brand cachet in LLM community"
    498     }
    499   },
    500   "hn_data": {
    501     "threads": [],
    502     "top_points": 0,
    503     "total_points": 0,
    504     "total_comments": 0
    505   }
    506 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs