scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23515B)
      1 {
      2   "paper": {
      3     "title": "Large Language Models for Constructing and Optimizing Machine Learning Workflows: A Survey",
      4     "authors": [
      5       "Yang Gu",
      6       "Hengyu You",
      7       "Jian Cao",
      8       "Muran Yu",
      9       "Haoran Fan",
     10       "Shiyou Qian"
     11     ],
     12     "year": 2024,
     13     "venue": "ACM Transactions on Software Engineering and Methodology",
     14     "arxiv_id": "2411.10478",
     15     "doi": "10.1145/3773084"
     16   },
     17   "scan_version": 2,
     18   "active_modules": ["survey_methodology"],
     19   "methodology_tags": ["meta-analysis"],
     20   "key_findings": "This survey provides a taxonomy of LLM applications across five ML workflow stages: data preprocessing, feature engineering, model selection, hyperparameter optimization, and workflow evaluation. It categorizes LLM-assisted model selection into retrieval-based and generation-based approaches, and HPO into execution-based and prediction-based methods. The paper identifies six open challenges (data leakage, prompt engineering complexity, hallucination, interpretability, resource consumption, social impact) and proposes end-to-end workflow construction and hybrid LLM-specialized model integration as future directions.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper provides a GitHub repository: 'visit our repository: https://github.com/t-harden/LLM4AutoML' in the footnote on page 1. This is a curated list of papers, not analysis code, but it is a released artifact."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No structured dataset or search corpus is released. The GitHub repo is a curated reading list, not the underlying survey data (search queries, inclusion/exclusion decisions, extraction sheets)."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No environment or dependency specifications are provided. The survey has no computational pipeline to reproduce."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No instructions for reproducing the survey methodology (search strategy, screening process, data extraction) are provided."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": false,
     47         "answer": false,
     48         "justification": "Survey paper with no experiments or quantitative analysis of its own."
     49       },
     50       "significance_tests": {
     51         "applies": false,
     52         "answer": false,
     53         "justification": "Survey paper with no experiments or statistical comparisons."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": false,
     57         "answer": false,
     58         "justification": "Survey paper with no experiments."
     59       },
     60       "sample_size_justified": {
     61         "applies": false,
     62         "answer": false,
     63         "justification": "Survey paper with no experiments."
     64       },
     65       "variance_reported": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "Survey paper with no experiments."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The paper positions itself against Tornede et al. [2023a]: 'it is the first survey to systematically address every stage of the ML workflow (Fig. 1), distinguishing it from previous reviews, such as Tornede et al. [2023a], which primarily explore the broader opportunities of an integration of LLMs and AutoML' (Section 1)."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The comparison baseline (Tornede et al. 2023) is contemporary and relevant."
     81       },
     82       "ablation_study": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "Survey paper — no system with components to ablate."
     86       },
     87       "multiple_metrics": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "Survey paper — no experiments with metrics."
     91       },
     92       "human_evaluation": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "Survey paper — no system outputs to evaluate."
     96       },
     97       "held_out_test_set": {
     98         "applies": false,
     99         "answer": false,
    100         "justification": "Survey paper — no experiments."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper organizes reviewed work by workflow stage (Sections 3-5) and provides Table 1 with a per-method breakdown across five workflow components (data preprocessing, feature engineering, model selection, HPO, evaluation)."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Each subsection discusses limitations and failure modes of the reviewed approaches. Section 6 dedicates substantial discussion to open challenges including hallucination, data leakage, prompt engineering difficulties, and resource consumption."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper consistently discusses what doesn't work: LLM misinterpretation of data structures (Section 3.1.2), biased feature selection (Section 3.2.1), hallucination in feature synthesis (Section 3.2.3), stochastic model selection (Section 4.1.2), and inaccurate performance predictions (Section 4.2.2)."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims a 'comprehensive and up-to-date review' covering 'data and feature engineering, model selection and hyperparameter optimization, and workflow evaluation.' The paper delivers on this structure across Sections 3-5 with Table 1 summarizing 22 methods."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The paper claims that LLMs 'streamline and enhance ML workflow modeling process' (abstract) and 'have shown great potential for automating and enhancing various stages', but it does not assess the quality of the evidence in the papers these claims rest on. The survey does not evaluate whether the improvements reported in cited papers are methodologically sound."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper states 'our focus is on capturing the breadth and possibility of research within the specific context of constructing and optimizing ML workflows, rather than providing an exhaustive examination of all LLM-related methods at each individual stage' (Section 1). The scope is clearly bounded to ML workflow construction."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The survey does not consider alternative explanations for the positive results reported in the literature, such as publication bias toward positive results, overfitting to benchmarks, or whether improvements are due to factors other than LLMs. Section 6 discusses challenges of LLMs but not whether the claimed benefits are real."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "Pure survey with no measurements of its own."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": false,
    148         "answer": false,
    149         "justification": "Survey paper — no models used by the authors."
    150       },
    151       "prompts_provided": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "Survey paper — no prompting used by the authors."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": false,
    158         "answer": false,
    159         "justification": "Survey paper — no experiments conducted."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "Survey paper — no agentic scaffolding used."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No paper selection pipeline is described. There is no mention of which databases were searched, what search queries were used, what inclusion/exclusion criteria were applied, or how many papers were screened at each stage. The survey does not explain how the 22 methods in Table 1 were identified."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 6 'Open Challenges and Future Directions' contains substantial discussion across six subsections (data leakage, prompt engineering, hallucination, interpretability, resource consumption, social impact)."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "Section 6 discusses challenges and limitations of LLM-driven ML workflows in general, but does not discuss threats to the validity of the survey itself — e.g., selection bias in paper collection, coverage gaps, or limitations of the non-systematic review methodology."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The paper explicitly states: 'our focus is on capturing the breadth and possibility of research within the specific context of constructing and optimizing ML workflows, rather than providing an exhaustive examination of all LLM-related methods at each individual stage' (Section 1)."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "No raw survey data (search results, screening decisions, extraction sheets) is available. The GitHub repo is a curated reading list, not the survey methodology artifacts."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "The paper does not describe how the surveyed papers were identified. No databases, search queries, time periods, or inclusion criteria are specified. The methodology for collecting the 22 methods in Table 1 is entirely opaque."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. The paper selection process (analogous to 'recruitment' for a survey) is not described, but this question is about human participant recruitment specifically."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No data pipeline from paper collection to final survey is documented. There is no PRISMA-like flow diagram or description of screening stages."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding information or acknowledgments section is present in the paper text."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly stated: Shanghai Jiao Tong University (Gu, You, Cao, Fan, Qian) and Stanford University (Yu)."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No funding is disclosed, so independence cannot be assessed."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial disclosure statement is present in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "Survey paper — does not evaluate any pre-trained model on a benchmark."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "Survey paper — does not evaluate any pre-trained model on a benchmark."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "Survey paper — does not evaluate any pre-trained model on a benchmark."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this survey paper."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this survey paper."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this survey paper."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this survey paper."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this survey paper."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this survey paper."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this survey paper."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": false,
    290         "answer": false,
    291         "justification": "Survey paper — no method of its own with inference costs."
    292       },
    293       "compute_budget_stated": {
    294         "applies": false,
    295         "answer": false,
    296         "justification": "Survey paper — no computational work performed."
    297       }
    298     },
    299     "survey_methodology": {
    300       "prisma_or_structured_protocol": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No PRISMA flow diagram, no registered protocol, no systematic search strategy. The paper does not describe which databases were searched, what queries were used, or how papers were screened. The selection methodology for Table 1's 22 methods is entirely undocumented."
    304       },
    305       "quality_assessment_of_sources": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The survey does not assess the methodological quality of the papers it reviews. All 22 methods in Table 1 are treated equivalently regardless of their experimental rigor, sample sizes, or validation quality. Claims from reviewed papers are reported at face value."
    309       },
    310       "publication_bias_discussed": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No discussion of publication bias. The survey does not consider whether the literature on LLMs for ML workflows skews toward positive results, or whether negative results (LLMs failing at workflow tasks) are underrepresented."
    314       }
    315     }
    316   },
    317   "claims": [
    318     {
    319       "claim": "This is the first survey to systematically address every stage of the ML workflow for LLM integration, distinguishing it from prior reviews like Tornede et al. (2023).",
    320       "evidence": "Section 1 states: 'To the best of our knowledge, it is the first survey to systematically address every stage of the ML workflow (Fig. 1), distinguishing it from previous reviews, such as Tornede et al. [2023a], which primarily explore the broader opportunities of an integration of LLMs and AutoML.'",
    321       "supported": "moderate"
    322     },
    323     {
    324       "claim": "LLMs have shown great potential for automating and enhancing various stages of the ML pipeline through language understanding, reasoning, interaction, and generation.",
    325       "evidence": "The paper reviews 22 methods across five workflow stages (Table 1), citing specific systems like CAAFE for feature engineering, HuggingGPT for model selection, LLAMBO for Bayesian optimization, and VML for workflow evaluation. However, results from these systems are reported as described by their authors without independent validation.",
    326       "supported": "moderate"
    327     },
    328     {
    329       "claim": "LLM-driven ML workflows face significant challenges including hallucination, data leakage, complicated prompt engineering, interpretability issues, high resource consumption, and social impact concerns.",
    330       "evidence": "Section 6 provides detailed discussion of each challenge with citations to specific systems exhibiting these issues (e.g., MLCopilot for hallucination, LLAMBO for resource consumption, CAAFE for hallucinated features).",
    331       "supported": "moderate"
    332     },
    333     {
    334       "claim": "End-to-end ML workflow construction using LLMs is a promising future direction where LLMs autonomously handle every pipeline stage.",
    335       "evidence": "Section 6.2 discusses this direction citing AIDE, ResearchAgent, CodeActAgent, and DS-Agent as existing implicit stage-wise solutions, but acknowledges these 'often lack transparency' and notes explicit end-to-end generation is not yet achieved.",
    336       "supported": "weak"
    337     }
    338   ],
    339   "red_flags": [
    340     {
    341       "flag": "No systematic search methodology",
    342       "detail": "The survey does not describe how papers were identified, which databases were searched, what search queries were used, or what inclusion/exclusion criteria were applied. The 22 methods in Table 1 appear to be an ad-hoc collection without a reproducible selection process."
    343     },
    344     {
    345       "flag": "No quality assessment of reviewed papers",
    346       "detail": "All reviewed papers are treated equally regardless of their methodological rigor. Claims from reviewed papers (e.g., that LLMs improve workflow efficiency) are reported at face value without assessing whether the underlying experiments are well-designed, properly controlled, or reproducible. As a result, weakly supported findings are presented with the same authority as well-validated ones, and readers cannot tell the two apart."
    347     },
    348     {
    349       "flag": "No publication bias discussion",
    350       "detail": "The survey does not consider whether the literature on LLMs for ML workflows is biased toward positive results. Papers showing LLMs failing at workflow tasks are likely underrepresented, which would systematically overstate the potential of LLM-driven workflows."
    351     },
    352     {
    353       "flag": "Overclaiming from limited evidence base",
    354       "detail": "The paper claims to provide a 'comprehensive' review but covers only 22 methods. Several workflow stages have very few entries (e.g., workflow evaluation has only 4 systems discussed). The comprehensiveness claim is not supported by evidence of systematic coverage."
    355     }
    356   ],
    357   "cited_papers": [
    358     {
    359       "title": "MLAgentBench: Evaluating Language Agents on Machine Learning Experimentation",
    360       "authors": ["Qian Huang", "Jian Vora", "Percy Liang", "Jure Leskovec"],
    361       "year": 2024,
    362       "relevance": "Benchmark for evaluating LLM agents on ML experimentation tasks, directly relevant to agentic AI evaluation methodology."
    363     },
    364     {
    365       "title": "Mle-bench: Evaluating machine learning agents on machine learning engineering",
    366       "authors": ["Jun Shern Chan", "Neil Chowdhury", "Oliver Jaffe"],
    367       "year": 2024,
    368       "arxiv_id": "2410.07095",
    369       "relevance": "Benchmark for evaluating ML engineering capabilities of LLM agents."
    370     },
    371     {
    372       "title": "GPT-4 Technical Report",
    373       "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"],
    374       "year": 2023,
    375       "arxiv_id": "2303.08774",
    376       "relevance": "Foundation model used in many of the surveyed ML workflow systems."
    377     },
    378     {
    379       "title": "HuggingGPT: Solving AI tasks with ChatGPT and its friends in Hugging Face",
    380       "authors": ["Yongliang Shen", "Kaitao Song", "Xu Tan"],
    381       "year": 2024,
    382       "relevance": "LLM-based agent for AI task solving through model selection and orchestration, key example of agentic AI workflow."
    383     },
    384     {
    385       "title": "Large language models for automated data science: Introducing CAAFE for context-aware automated feature engineering",
    386       "authors": ["Noah Hollmann", "Samuel Müller", "Frank Hutter"],
    387       "year": 2024,
    388       "relevance": "LLM-driven automated feature engineering system demonstrating LLM capabilities in ML workflow automation."
    389     },
    390     {
    391       "title": "DS-Agent: Automated Data Science by Empowering Large Language Models with Case-Based Reasoning",
    392       "authors": ["Siyuan Guo", "Cheng Deng", "Ying Wen"],
    393       "year": 2024,
    394       "arxiv_id": "2402.17453",
    395       "relevance": "Agentic LLM system for automated data science with case-based reasoning."
    396     },
    397     {
    398       "title": "AutoML-Agent: A Multi-Agent LLM Framework for Full-Pipeline AutoML",
    399       "authors": ["Patara Trirat", "Wonyong Jeong", "Sung Ju Hwang"],
    400       "year": 2024,
    401       "arxiv_id": "2410.02958",
    402       "relevance": "Multi-agent LLM framework covering full AutoML pipeline, directly relevant to agentic AI workflow construction."
    403     },
    404     {
    405       "title": "Verbalized Machine Learning: Revisiting Machine Learning with Language Models",
    406       "authors": ["Tim Z Xiao", "Robert Bamler", "Bernhard Schölkopf", "Weiyang Liu"],
    407       "year": 2024,
    408       "arxiv_id": "2406.04344",
    409       "relevance": "Novel approach using LLM prompts as ML model parameters, relevant to LLM capability evaluation."
    410     },
    411     {
    412       "title": "Large Language Models to Enhance Bayesian Optimization",
    413       "authors": ["Tennison Liu", "Nicolás Astorga", "Nabeel Seedat", "Mihaela van der Schaar"],
    414       "year": 2024,
    415       "arxiv_id": "2402.03921",
    416       "relevance": "LLM-enhanced Bayesian optimization (LLAMBO) for hyperparameter tuning, demonstrating LLM integration into optimization."
    417     },
    418     {
    419       "title": "Chain of thought prompting elicits reasoning in large language models",
    420       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    421       "year": 2022,
    422       "arxiv_id": "2201.11903",
    423       "relevance": "Foundational prompting technique used in many LLM-based ML workflow systems."
    424     },
    425     {
    426       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    427       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    428       "year": 2024,
    429       "arxiv_id": "2407.16741",
    430       "relevance": "Open-source platform for AI coding agents, relevant to agentic software development workflows."
    431     },
    432     {
    433       "title": "AutoML in the age of large language models: Current challenges, future opportunities and risks",
    434       "authors": ["Alexander Tornede", "Difan Deng", "Theresa Eimer"],
    435       "year": 2023,
    436       "arxiv_id": "2306.08107",
    437       "relevance": "Prior survey on LLM-AutoML integration that this paper positions against, directly relevant to survey methodology comparison."
    438     },
    439     {
    440       "title": "Github copilot AI pair programmer: Asset or liability?",
    441       "authors": ["Arghavan Moradi Dakhel", "Vahid Majdinasab", "Amin Nikanjam"],
    442       "year": 2023,
    443       "doi": "10.1016/j.jss.2023.111734",
    444       "relevance": "Evaluation of GitHub Copilot for programming, relevant to LLM code generation productivity assessment."
    445     }
    446   ]
    447 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs