scan-v5.json (19371B)
1 { 2 "scan_version": 5, 3 "paper_type": "survey", 4 "paper": { 5 "title": "Large Language Models for Constructing and Optimizing Machine Learning Workflows: A Survey", 6 "authors": [ 7 "Yang Gu", 8 "Hengyu You", 9 "Jian Cao", 10 "Muran Yu", 11 "Haoran Fan" 12 ], 13 "year": 2024, 14 "venue": "ACM Transactions on Software Engineering and Methodology", 15 "arxiv_id": "2411.10478", 16 "doi": "10.1145/3773084" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "The abstract claims a comprehensive review covering data/feature engineering, model selection/HPO, and workflow evaluation — all of which are delivered in Sections 3–5 with multiple systems reviewed per stage.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The survey asserts LLMs 'revolutionized AI and ML' and 'demonstrate significant potential for automating and enhancing various stages' without systematic comparison to baselines or aggregated evidence; these causal claims are unsupported by the survey's own analysis.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "Conclusions about LLM benefits are stated broadly across all ML workflows and domains without bounding them to specific task types, scales, or conditions — the survey treats cherry-picked papers as representative.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "The survey acknowledges limitations like hallucination and data leakage but never considers whether observed performance gains in reviewed papers might be due to dataset contamination, cherry-picked benchmarks, or lack of proper baselines.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": false, 47 "justification": "Reviewed 
papers use metrics like accuracy and loss to claim ML workflow improvements, but the survey never questions whether these proxy metrics capture real-world practitioner needs or workflow quality.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section 6 'Open Challenges and Future Directions' dedicates six subsections to challenges including data leakage, prompt engineering, hallucination, interpretability, and resource consumption.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "Threats discussed concern limitations of the reviewed systems (e.g., LLM hallucinations in CAAFE), not threats to the survey's own validity such as search completeness, selection bias, or quality of reviewed evidence.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": true, 67 "justification": "The introduction explicitly states focus on three ML workflow stages and distinguishes scope from Tornede et al. 
[2023a]; the paper also states it captures 'breadth and possibility' rather than exhaustive coverage of every LLM method.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No funding source is disclosed anywhere in the paper; only institutional affiliations (SJTU, Stanford) appear in the author block.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations are clearly listed: Shanghai Jiao Tong University and Stanford University.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": false, 86 "answer": false, 87 "justification": "No funding is disclosed, so independence cannot be assessed.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests statement, patent disclosures, or financial interest declarations appear in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Section 2 defines 'ML workflow,' 'task specification,' and provides explicit background on LLMs; all major sub-components (data preprocessing, feature engineering, model selection, HPO) are defined at section introductions.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "The introduction explicitly claims this is 'the first survey to systematically address every stage of the ML workflow' and aims to provide researchers and practitioners a 'comprehensive understanding of the strengths and limitations of LLMs' in this context.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper compares itself to Tornede et al. 
[2023a] and explains how it differs; Table 1 situates reviewed methods relative to workflow stages, providing structural differentiation from prior reviews.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "survey": { 120 "search_and_selection": { 121 "search_strategy_reproducible": { 122 "applies": true, 123 "answer": false, 124 "justification": "No search methodology is described; Table 1 lists 23 systems but provides no explanation of how the corpus was assembled — no keywords, no database queries, no systematic procedure.", 125 "source": "haiku" 126 }, 127 "inclusion_exclusion_explicit": { 128 "applies": true, 129 "answer": false, 130 "justification": "No inclusion or exclusion criteria are stated; the paper says it focuses on 'capturing the breadth and possibility of research' without defining what qualifies a paper for inclusion.", 131 "source": "haiku" 132 }, 133 "prisma_or_structured_protocol": { 134 "applies": true, 135 "answer": false, 136 "justification": "No PRISMA flowchart, systematic review protocol, or structured selection framework is mentioned anywhere in the paper.", 137 "source": "haiku" 138 }, 139 "search_terms_provided": { 140 "applies": true, 141 "answer": false, 142 "justification": "No search terms or queries are provided; the paper does not describe any database search process.", 143 "source": "haiku" 144 }, 145 "databases_listed": { 146 "applies": true, 147 "answer": false, 148 "justification": "No literature databases (e.g., ACM DL, IEEE Xplore, arXiv, Semantic Scholar) are listed as search sources.", 149 "source": "haiku" 150 }, 151 "screening_process_documented": { 152 "applies": true, 153 "answer": false, 154 "justification": "There is no documentation of how many papers were initially retrieved, screened, and ultimately included; the 23 methods in Table 1 appear without provenance.", 155 "source": "haiku" 156 }, 157 "review_scope_justified": { 158 "applies": true, 159 "answer": false, 160 "justification": "The 
focus on ML workflows is stated but not justified; no rationale is given for which years, venues, or subtopics are in scope, nor why certain adjacent areas are excluded.", 161 "source": "haiku" 162 } 163 }, 164 "synthesis_quality": { 165 "conflicting_findings_acknowledged": { 166 "applies": true, 167 "answer": false, 168 "justification": "The paper presents execution-based vs prediction-based HPO tradeoffs, but does not systematically surface or reconcile conflicting empirical findings across reviewed papers — papers are described individually without cross-paper comparison.", 169 "source": "haiku" 170 }, 171 "quality_assessment_of_sources": { 172 "applies": true, 173 "answer": false, 174 "justification": "No quality rubric, risk-of-bias assessment, or structured evaluation is applied to reviewed papers; preprints and peer-reviewed papers are treated identically.", 175 "source": "haiku" 176 }, 177 "publication_bias_discussed": { 178 "applies": true, 179 "answer": false, 180 "justification": "Publication bias is never mentioned; the survey reviews only systems claiming positive results without acknowledging that failed LLM-for-AutoML approaches likely go unpublished.", 181 "source": "haiku" 182 }, 183 "quantitative_synthesis_present": { 184 "applies": true, 185 "answer": false, 186 "justification": "The synthesis is entirely narrative; no meta-analysis, vote counting, effect size aggregation, or quantitative comparison across methods is provided.", 187 "source": "haiku" 188 }, 189 "recommendations_supported_by_evidence": { 190 "applies": true, 191 "answer": false, 192 "justification": "Future directions (end-to-end workflow construction, hybrid LLM+specialized model systems) are stated as logical extrapolations without being grounded in quantitative evidence from the reviewed papers.", 193 "source": "haiku" 194 } 195 } 196 } 197 }, 198 "claims": [ 199 { 200 "claim": "This is the first survey to systematically address every stage of the ML workflow when integrating 
LLMs.", 201 "evidence": "Stated in the introduction, contrasted only against Tornede et al. [2023a] which 'primarily explores broader opportunities'; no systematic literature search conducted to confirm priority.", 202 "supported": "weak" 203 }, 204 { 205 "claim": "LLMs can streamline and enhance ML workflow construction through language understanding, reasoning, interaction, and generation.", 206 "evidence": "23 systems reviewed (Table 1) demonstrate LLM applications at various workflow stages, but no comparative benchmarks or aggregated success rates are provided.", 207 "supported": "moderate" 208 }, 209 { 210 "claim": "Prediction-based HPO using LLMs is faster but less accurate than execution-based HPO.", 211 "evidence": "Contrasted via AgentHPO and LLAMBO descriptions; no empirical comparison table or statistical evidence provided — relies on authors' characterization of individual papers.", 212 "supported": "weak" 213 }, 214 { 215 "claim": "Data leakage is a major concern when integrating LLMs into ML workflows because LLMs are pretrained on common ML benchmark datasets.", 216 "evidence": "Section 6.1.1 provides logical argument with citations (Zhang et al. [2023b], Hollmann et al. [2024]); empirically documented in some reviewed papers but not systematically measured.", 217 "supported": "moderate" 218 }, 219 { 220 "claim": "LLM hallucinations pose risks to ML workflow integrity, producing plausible but incorrect models, hyperparameters, and features.", 221 "evidence": "Section 6.1.3 gives specific examples from CAAFE (features not grounded in data), MLCopilot (incorrect model recommendations), and VML (numerical errors); well-documented in reviewed literature.", 222 "supported": "strong" 223 }, 224 { 225 "claim": "LLM-driven workflow evaluation can significantly reduce time and computational resources compared to full model training.", 226 "evidence": "AutoML-GPT and VML cited as examples; no quantitative comparison of resource savings vs. 
traditional evaluation is provided.", 227 "supported": "weak" 228 } 229 ], 230 "methodology_tags": [ 231 "survey" 232 ], 233 "key_findings": "This narrative survey reviews 23 LLM-assisted ML workflow systems across data/feature engineering, model selection, hyperparameter optimization, and workflow evaluation, organized into a taxonomy of retrieval-based vs. generation-based and execution-based vs. prediction-based approaches. Key identified challenges are data leakage from LLM pretraining on benchmark datasets, hallucination producing plausible but incorrect configurations, prompt engineering complexity, and substantial resource consumption. The survey contains no systematic search methodology, quality assessment of reviewed papers, or quantitative synthesis — it is an unsystematic narrative overview that treats all reviewed systems equally regardless of empirical rigor. Future directions proposed (end-to-end autonomous workflow construction, hybrid LLM+specialized models) lack grounding in aggregated evidence.", 234 "red_flags": [ 235 { 236 "flag": "No search methodology", 237 "detail": "23 reviewed systems appear in Table 1 with no documented search strategy, databases queried, keywords used, or screening process — the corpus selection is opaque and unreproducible." 238 }, 239 { 240 "flag": "No quality assessment of sources", 241 "detail": "Preprints and peer-reviewed papers are treated identically; systems are described at face value with no risk-of-bias evaluation or assessment of empirical rigor of the reviewed work." 242 }, 243 { 244 "flag": "Publication bias ignored", 245 "detail": "All 23 reviewed systems show positive results; no acknowledgment that failed LLM-for-AutoML systems are systematically underrepresented." 
246 }, 247 { 248 "flag": "Priority claim unverifiable", 249 "detail": "Claims to be 'the first survey to systematically address every stage of the ML workflow' but no systematic literature search was conducted to substantiate this; the claim is unverifiable." 250 }, 251 { 252 "flag": "No funding disclosure", 253 "detail": "No funding source is mentioned anywhere in the paper despite its publication in a major venue (ACM TOSEM)." 254 }, 255 { 256 "flag": "Purely narrative synthesis", 257 "detail": "No quantitative comparison across reviewed systems — performance metrics, success rates, and resource costs are described per-paper without aggregation, making cross-method evaluation impossible." 258 } 259 ], 260 "cited_papers": [ 261 { 262 "title": "AutoML in the Age of Large Language Models: Current Challenges, Future Opportunities and Risks", 263 "relevance": "Direct predecessor survey that this paper explicitly positions against; covers broader LLM-AutoML opportunities without per-stage workflow analysis." 264 }, 265 { 266 "title": "MLE-Bench: Evaluating Machine Learning Agents on Machine Learning Engineering", 267 "relevance": "Benchmark for evaluating LLM agents on ML engineering tasks — directly relevant to assessing the effectiveness of surveyed systems." 268 }, 269 { 270 "title": "MLAgentBench: Evaluating Language Agents on Machine Learning Experimentation", 271 "relevance": "Evaluation framework for LLM-based ML agents; cited as example of benchmark used to assess LLM workflow capabilities." 272 }, 273 { 274 "title": "Large Language Models for Automated Data Science: Introducing CAAFE for Context-Aware Automated Feature Engineering", 275 "relevance": "Most frequently cited reviewed system; demonstrates LLM-driven feature synthesis with iterative generation and explanations." 
276 }, 277 { 278 "title": "MLCopilot: Unleashing the Power of Large Language Models in Solving Machine Learning Tasks", 279 "relevance": "Knowledge-based LLM system for ML task solving via historical experience retrieval; central example of retrieval-based model selection." 280 }, 281 { 282 "title": "Large Language Models to Enhance Bayesian Optimization (LLAMBO)", 283 "relevance": "LLM-enhanced Bayesian optimization for HPO; key example of prediction-based HPO and resource efficiency tradeoffs discussed." 284 }, 285 { 286 "title": "Large Language Model Agent for Hyper-Parameter Optimization (AgentHPO)", 287 "relevance": "Two-agent LLM framework for HPO; primary example of execution-based HPO approach contrasted against prediction-based methods." 288 }, 289 { 290 "title": "AutoMMLab: Automatically Generating Deployable Models from Language Instructions for Computer Vision Tasks", 291 "relevance": "End-to-end LLM-driven ML workflow system covering dataset recommendation, model selection, and HPO; extensively cited across all survey sections." 292 }, 293 { 294 "title": "AutoML-Agent: A Multi-Agent LLM Framework for Full-Pipeline AutoML", 295 "relevance": "Multi-agent framework spanning all ML workflow stages; most comprehensive reviewed system and counterpoint to single-agent approaches." 296 }, 297 { 298 "title": "Verbalized Machine Learning: Revisiting Machine Learning with Language Models", 299 "relevance": "Framing ML model parameters as natural language prompts; novel generation-based model selection approach discussed in multiple sections." 300 } 301 ], 302 "engagement_factors": { 303 "practical_relevance": { 304 "score": 3, 305 "justification": "Directly addresses ML practitioners' pain points in workflow automation with actionable taxonomy of existing tools across all workflow stages." 
306 }, 307 "surprise_contrarian": { 308 "score": 1, 309 "justification": "Mostly confirms conventional wisdom that LLMs can help automate ML tasks; challenges are acknowledged but the overall framing is optimistic and expected." 310 }, 311 "fear_safety": { 312 "score": 1, 313 "justification": "Mentions data leakage, hallucination, and displacement of ML workers but frames these as engineering challenges rather than existential risks." 314 }, 315 "drama_conflict": { 316 "score": 0, 317 "justification": "No controversy, competing camps, or dramatic empirical reversals; purely a constructive organizational survey." 318 }, 319 "demo_ability": { 320 "score": 2, 321 "justification": "Several reviewed systems are publicly available (CAAFE, Aliro, AutoML-Agent) and the authors maintain a GitHub repository at github.com/t-harden/LLM4AutoML." 322 }, 323 "brand_recognition": { 324 "score": 1, 325 "justification": "Published in ACM TOSEM from SJTU; extensively discusses GPT-4, LLaMA, and other recognizable models but the survey itself is not from a top-tier AI lab." 326 } 327 }, 328 "hn_data": { 329 "threads": [ 330 { 331 "hn_id": "39582552", 332 "title": "Separating a particle's mass from its momentum", 333 "points": 53, 334 "comments": 17, 335 "url": "https://news.ycombinator.com/item?id=39582552", 336 "created_at": "2024-03-03T17:46:20Z" 337 }, 338 { 339 "hn_id": "39149306", 340 "title": "Dynamic Programming: Finite States Thomas J. Sargent, John Stachurski", 341 "points": 2, 342 "comments": 1, 343 "url": "https://news.ycombinator.com/item?id=39149306", 344 "created_at": "2024-01-26T22:09:08Z" 345 }, 346 { 347 "hn_id": "38403066", 348 "title": "Is Mathematics a Game?", 349 "points": 2, 350 "comments": 0, 351 "url": "https://news.ycombinator.com/item?id=38403066", 352 "created_at": "2023-11-24T11:58:18Z" 353 } 354 ], 355 "top_points": 53, 356 "total_points": 57, 357 "total_comments": 18 358 } 359 }