scan.json (19382B)
1 { 2 "paper": { 3 "title": "CatDB: Data-catalog-guided, LLM-based Generation of Data-centric ML Pipelines", 4 "authors": ["Saeed Fathollahzadeh", "Essam Mansour", "Matthias Boehm"], 5 "year": 2025, 6 "venue": "PVLDB", 7 "doi": "10.14778/3742728.3742754" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "GitHub repository explicitly provided: https://github.com/CoDS-GCS/CatDB.git, listed under 'PVLDB Artifact Availability'." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The artifact availability statement says 'source code, data, and/or other artifacts have been made available' at the GitHub link. The 20 datasets used are described in Table 3 and appear to be real-world datasets." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "Section 5.1 mentions Ubuntu 22.04, OpenJDK 11, Python 3.10, and hardware specs, but no requirements.txt, Dockerfile, or detailed dependency listing is provided in the paper." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but the paper itself contains no README-level guidance for reproducing experiments." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Results in Tables 5 and 7 are point estimates. Figure 11 shows box plots for 10 iterations but no confidence intervals or error bars with explicit values are reported in tables." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims CatDB achieves 'comparable or better' accuracy than baselines but no statistical significance tests are used to support these comparative claims." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports percentage improvements with baseline context throughout, e.g., 'up to 52% gains' (Section 5.3), '8x to 14x speedup' (Section 5.4), and absolute accuracy values for all systems enabling comparison." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification for why 20 datasets were chosen, why 10 iterations were used for the repeated experiments, or any power analysis." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Figure 11 shows box plots across 10 iterations for three datasets, depicting variance across runs. The success ratios are also annotated on the plots." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Extensive baselines: LLM-based (CAAFE, AIDE, AutoGen), AutoML tools (AutoGluon, H2O, FLAML, Auto-Sklearn), and AutoML with preprocessing workflows (SAGA, L2C + AutoML)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include AIDE (2024), AutoGen (2024), CAAFE (2023), H2O, FLAML, AutoGluon — all recent and state-of-the-art systems." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Section 5.2 studies metadata impact with different configurations (Table 1, Figure 10). Section 5.3 compares original vs refined data catalog. CatDB vs CatDB Chain is also an ablation of the chaining strategy." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "AUC for binary classification, AUC-ovr for multiclass, R² for regression. Token cost and runtime are also reported as secondary metrics." 
79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "The system generates ML pipelines evaluated by automated metrics (accuracy, AUC, R²). Human evaluation of generated code quality is not clearly relevant to the claims." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "Section 5.1: 'We divided all datasets into 70/30 train and test sets.' Tables report both train and test accuracy separately." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down per dataset (20 datasets), per LLM (GPT-4o, Gemini-1.5, Llama3.1-70b), and per task type (binary, multiclass, regression)." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 4.2 discusses error types and frequencies (Figure 8). Tables show N/A and failure cases. Section 5.4 discusses where baselines fail. Section 4.3 discusses system limitations." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 5.2 notes 'more metadata does not always improve pipeline quality.' CatDB underperforms some baselines on certain datasets (e.g., NYC regression in Table 7). Yelp shows increased runtime due to one-hot encoding." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims of 'accuracy comparable to or better than existing LLM-based systems, standalone AutoML tools' and 'orders of magnitude faster performance' are supported by Tables 5-8 and Figures 11-14." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "Causal claims about metadata improving pipeline quality are supported by ablation-style experiments in Section 5.2 (varying metadata configurations) and Section 5.3 (original vs refined catalog)." 
116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "Section 4.3 explicitly states limitations: only supervised learning, basic cleaning tasks, no library constraints. The scope is bounded to tabular data with specific dataset characteristics described in Table 3." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No discussion of alternative explanations for observed results. For example, whether the improvements come from the data catalog approach specifically or from better prompt engineering generally is not disentangled." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper uses 'GPT-4o', 'Gemini-1.5-pro', and 'Llama3.1-70b' but does not specify snapshot dates or API versions for GPT-4o or Gemini-1.5-pro." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Figure 3 shows a complete example prompt with actual content. Figures 6(a) and 6(b) show prompt templates. The system rules and catalog data structure are documented in detail." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Section 5.4 states 'LLM temperature set to zero.' The maximum error correction attempts (α₂) parameter is discussed in Algorithm 4." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The full pipeline scaffolding is described: data profiling (Algorithm 1), metadata/rules extraction (Algorithm 2), prompt construction (Algorithm 3), pipeline generation with error handling (Algorithm 4, Figure 7)." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 5.1 describes dataset preparation: 70/30 split, profiling, metadata storage. 
Table 3 describes all datasets with characteristics. The data catalog refinement process is detailed in Section 3.2." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 4.3 'System Limitations' provides a dedicated discussion of three specific limitations." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Section 4.3 identifies specific limitations: only basic data cleaning supported, only supervised learning tasks, no library constraints enforcement. These are specific to this system." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 4.3 explicitly states what is NOT supported: entity resolution, data augmentation from data lakes, unsupervised tasks, time-series and image data, library constraints." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "The artifact availability statement indicates data is available at the GitHub repository. Table 3 lists all 20 datasets with characteristics." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Table 3 describes all datasets with number of tables, rows, columns, task types, and classes. The error trace dataset is described in Table 2 with total request counts per LLM." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants. Datasets are standard benchmarks and real-world datasets described in Table 3." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The full pipeline from dataset profiling to prompt construction to pipeline generation is documented in Algorithms 1-4 and Sections 3-4." 
192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding or acknowledgments section found in the paper." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations clearly listed: Concordia University and Technische Universität Berlin. No affiliation with evaluated LLM providers." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information disclosed, so independence cannot be assessed." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial disclosure statement found in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper evaluates LLMs (GPT-4o, Gemini-1.5-pro, Llama3.1-70b) on their ability to generate pipelines for datasets but does not state model training cutoff dates. The paper even acknowledges that LLMs may have seen benchmark datasets (Section 1 mentions Titanic)." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "Section 1 explicitly discusses that LLMs may have been exposed to well-known datasets during training (e.g., Titanic), which is why CatDB focuses on unseen datasets. This is a core motivation of the paper." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "While the paper motivates the problem of LLMs knowing benchmark datasets, it does not systematically analyze which of the 20 evaluation datasets may have been in the LLMs' training data." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 
238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "Token consumption is reported extensively in Figures 12 and 13 across all datasets and LLMs. Section 4.1 provides a formal cost analysis (Equations 1-2)." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": true, 279 "justification": "Section 5.1 specifies hardware (Intel Core CPU with 32 vcores, 148 GB RAM). Runtime is reported in Tables 6 and 8, and Figures 12 and 9(a)." 
280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "CatDB generates ML pipelines with accuracy comparable to or better than existing LLM-based systems, AutoML tools, and combined workflows.", 286 "evidence": "Tables 5 and 7 show CatDB achieving top or competitive accuracy across 20 datasets against CAAFE, AIDE, AutoGen, H2O, FLAML, AutoGluon, and Auto-Sklearn.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "CatDB delivers up to orders of magnitude faster pipeline execution on large datasets.", 291 "evidence": "Table 6 shows CatDB execution times of 1-4 seconds vs hundreds of seconds for baselines on most datasets. Table 8 shows CatDB completing all 8 datasets while baselines fail on several.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "Data catalog refinement improves pipeline accuracy by up to 52%.", 296 "evidence": "Table 5 compares original vs refined accuracy on 6 datasets, with EU IT improving from 39.2% to 91.8% test accuracy (Section 5.3).", 297 "supported": "strong" 298 }, 299 { 300 "claim": "CatDB is robust to data quality issues (outliers and missing values) compared to AutoML tools.", 301 "evidence": "Figure 14 shows CatDB maintaining performance under increasing outlier and missing value injection while AutoML tools deteriorate.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "More metadata does not always improve pipeline quality.", 306 "evidence": "Section 5.2 and Figure 10 show that simple schema metadata sometimes matches or outperforms more detailed metadata configurations.", 307 "supported": "moderate" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "CatDB leverages data catalog information to guide LLMs in generating data-centric ML pipelines, achieving accuracy comparable to or better than state-of-the-art LLM-based systems (CAAFE, AIDE, AutoGen) and AutoML tools (H2O, FLAML, AutoGluon) across 20 diverse datasets. 
The system's data catalog refinement step can improve accuracy by up to 52% on datasets with data quality issues. CatDB demonstrates orders-of-magnitude faster pipeline execution times compared to baselines on large datasets, with zero failures across all 20 datasets and 3 LLMs. The chain prompting variant (CatDB Chain) effectively handles large datasets that exceed LLM context limits.", 312 "red_flags": [ 313 { 314 "flag": "No statistical significance testing", 315 "detail": "Comparative claims of 'better' or 'comparable' performance are based on raw number comparisons without any statistical tests, despite the acknowledged variance in LLM outputs." 316 }, 317 { 318 "flag": "Unspecified model versions", 319 "detail": "GPT-4o and Gemini-1.5-pro are used without snapshot dates or API version identifiers. LLM behavior can change across versions, making results potentially non-reproducible." 320 }, 321 { 322 "flag": "Selective dataset contamination discussion", 323 "detail": "The paper acknowledges LLMs may know benchmark datasets (Titanic example) but does not analyze which of the 20 evaluation datasets may be in the training data of GPT-4o, Gemini, or Llama." 324 } 325 ], 326 "cited_papers": [ 327 { 328 "title": "Large Language Models for Automated Data Science: Introducing CAAFE for Context-Aware Automated Feature Engineering", 329 "authors": ["Noah Hollmann", "Samuel Müller", "Frank Hutter"], 330 "year": 2023, 331 "relevance": "LLM-based feature engineering baseline directly compared to CatDB." 332 }, 333 { 334 "title": "AIDE: Human-Level Performance in Data Science Competitions", 335 "authors": ["Dominik Schmidt", "Yuxiang Wu", "Zhengyao Jiang"], 336 "year": 2024, 337 "relevance": "End-to-end LLM-based ML solution generator used as a primary baseline." 
338 }, 339 { 340 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 341 "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"], 342 "year": 2024, 343 "relevance": "Multi-agent LLM framework used as baseline for ML pipeline generation." 344 }, 345 { 346 "title": "Can Foundation Models Wrangle Your Data?", 347 "authors": ["Avanika Narayan", "Ines Chami", "Laurel J. Orr", "Christopher Ré"], 348 "year": 2022, 349 "relevance": "Studies LLM capabilities for data wrangling tasks, directly relevant to LLM-based data processing." 350 }, 351 { 352 "title": "Calibrated Language Models Must Hallucinate", 353 "authors": ["Adam Tauman Kalai", "Santosh S. Vempala"], 354 "year": 2023, 355 "arxiv_id": "2311.14648", 356 "relevance": "Theoretical work on LLM hallucination that motivates CatDB's error handling approach." 357 }, 358 { 359 "title": "FLAML: A Fast and Lightweight AutoML Library", 360 "authors": ["Chi Wang", "Qingyun Wu", "Markus Weimer", "Erkang Zhu"], 361 "year": 2021, 362 "relevance": "AutoML baseline system compared against CatDB." 363 }, 364 { 365 "title": "AutoGluon-Tabular: Robust and Accurate AutoML for Structured Data", 366 "authors": ["Nick Erickson", "Jonas Mueller"], 367 "year": 2020, 368 "arxiv_id": "2003.06505", 369 "relevance": "AutoML baseline for tabular data compared against CatDB." 370 }, 371 { 372 "title": "SAGA: A Scalable Framework for Optimizing Data Cleaning Pipelines for Machine Learning Applications", 373 "authors": ["Shafaq Siddiqi", "Roman Kern", "Matthias Boehm"], 374 "year": 2023, 375 "relevance": "Data cleaning pipeline framework used in AutoML workflow baselines." 376 } 377 ] 378 }