scan.json (25243B)
1 { 2 "paper": { 3 "title": "AutoStreamPipe: LLM Assisted Automatic Generation of Data Stream Processing Pipelines", 4 "authors": [ 5 "Abolfazl Younesi", 6 "Zahra Najafabadi Samani", 7 "Thomas Fahringer" 8 ], 9 "year": 2025, 10 "venue": "arXiv preprint", 11 "arxiv_id": "2510.23408" 12 }, 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "The paper provides a GitHub repository URL (https://github.com/Anonymous0-0paper/SWG) and describes it as an 'open-source implementation' in the contributions section. However, this appears to be an anonymized repository for review, which may be temporary." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The queries and benchmarks are stated to be available in the GitHub repository. Section 5.1.1 says 'All queries are available in our repository' and 'All benchmarks are available on the AutoStreamPipe GitHub repository.'" 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": true, 28 "justification": "Section 5.1 specifies: Python v3.12.7, LangChain v0.3.11, Apache Flink v1.20.1, Storm v2.8.0, Spark v3.5.5, Intel Core i7-13700K processor with 64GB RAM, Ubuntu 22.04 LTS. This is detailed enough to recreate the environment." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "While the paper describes the system architecture in detail and provides code, there are no explicit step-by-step reproduction instructions, README with commands, or a 'Reproducing Results' section in the paper text." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper reports point estimates for EFS scores (e.g., 0.98, 0.70, 0.59) and average error counts without confidence intervals or error bars. Box plots in Figure 10 show IQR but the main results in Table 4 are point estimates." 
41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper makes comparative claims (e.g., '5.19x error reduction', '50% improvement over CoT Planning') without any statistical significance tests. All comparisons are based on raw number differences." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "The paper reports relative improvements with baseline context: '6.3x reduction in development time', '5.19x error reduction', '50% improvement over CoT Planning', '64% improvement over Base-LLM', '15-20% higher throughput', '30-35% lower latency'. These provide magnitude context." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "The benchmark suite contains 8 applications, each generated 5 times for 3 SPEs (120 total pipeline generations). No justification is provided for why 5 repetitions or 8 applications were chosen, and no power analysis is discussed." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "The paper reports averages (e.g., 'average EFS of 0.98') but does not report standard deviations or variance across the 5 runs per configuration. Box plots in Figure 10 show some spread but the main quantitative results lack variance measures." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Section 5.1.2 defines three baselines: Base-LLM (direct queries without HGoT/planning/resilience), CoT Planning (query analysis + CoT without resilience), and GoT Based (standard GoT without hypergraph). Additionally, NiFi and manual coding are compared for development time (Table 5)." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": false, 72 "justification": "The baselines are ablations of the system itself (Base-LLM, CoT, GoT), not independent prior work. 
No external competing system for LLM-based pipeline generation is compared. The related work mentions prior tools but none are included as baselines. NiFi is compared only for development time, not for pipeline quality." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "The baselines effectively serve as ablations: Base-LLM (no HGoT, no planning, no resilience), CoT Planning (with query analysis but no hypergraph), GoT Based (standard graph without hyperedges), and AutoStreamPipe (full system). This progressively adds components." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "The paper uses multiple metrics: Error-Free Score (EFS), processing time, response completeness, syntax/logic/runtime error counts, development time, throughput (k events/s), and P99 latency (ms)." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": false, 87 "justification": "No human evaluation of the generated pipeline quality is performed. All evaluation is automated (compilation tests, error counting, timing). Human judgment on code readability, maintainability, or correctness of generated pipelines would have been relevant." 88 }, 89 "held_out_test_set": { 90 "applies": false, 91 "answer": false, 92 "justification": "This is a system evaluation paper, not a training/tuning paper. The pipeline benchmarks are directly evaluated rather than split into train/test. There is no model fine-tuning involved." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results are broken down by pipeline complexity (simple/medium/complex), by SPE (Flink/Storm/Spark), by error type (syntax/logic/runtime), and by query type (complete/partial information). Tables 4 and 5 and Figures 7-11 provide detailed breakdowns." 
98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "The paper discusses failure patterns: stateful pipelines produce more errors than stateless ones (Figure 10), complex pipelines with partial information are most challenging (EFS drops to 0.59), and specific error types are analyzed. Figure 11 shows cases that were not fully fixed." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper reports that performance declines for complex pipelines (EFS of 0.59 vs 0.98 for simple), acknowledges 'the reduced performance is expected', and shows that some pipelines are not fully fixed (Figure 11a shows partial fixes). Logic errors in complex pipelines remain high even for AutoStreamPipe." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims '6.3x development time reduction' and '5.19x error rate reduction' are supported by Table 5 (development time) and Table 4 / Section 5.3 (EFS comparisons). The claim of 'good accuracy' is supported by the EFS scores." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper attributes improvements to specific components (HGoT, query analyzer, resilient execution) and supports these through ablation-style comparisons (Base-LLM vs CoT vs GoT vs ASP). Each step adds a component, making the causal attribution reasonable through controlled ablation." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title and abstract claim to 'automate the design, generation, and deployment of stream processing pipelines' generally, but the evaluation covers only 8 specific pipeline types across 3 SPEs with 4 specific LLMs. The paper does not bound its claims to these tested settings. 
Claims like 'capable of generating a virtually unlimited number of SP pipelines' in Section 1 are not bounded." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper does not discuss alternative explanations for its results. For example, the improvement could be due to the additional LLM calls (more compute) rather than the HGoT structure itself. No threats-to-validity section or discussion of confounds is present." 130 } 131 }, 132 "setup_transparency": { 133 "model_versions_specified": { 134 "applies": true, 135 "answer": true, 136 "justification": "Table 3 specifies exact model versions: 'claude-3-5-haiku-20241022', 'gpt-4o-mini-2024-07-18', 'open-codestral-mamba', and 'llama-3.3-70b-versatile'. All four are specific API identifiers; the first two also include snapshot dates." 137 }, 138 "prompts_provided": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper describes what prompts do (intent detection, parameter extraction, pipeline generation) but does not provide the actual prompt text used for any LLM call. Only natural language descriptions of the prompting process are given." 142 }, 143 "hyperparameters_reported": { 144 "applies": true, 145 "answer": false, 146 "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported. The system configuration mentions 'maximum file size, chunk size' but no LLM sampling parameters are specified." 147 }, 148 "scaffolding_described": { 149 "applies": true, 150 "answer": true, 151 "justification": "The multi-agent scaffolding is described in detail: Algorithms 1-5 specify retry logic, model rotation, HGoT construction, step-by-step execution with dependencies, and fallback mechanisms. The three-phase architecture is thoroughly documented." 
152 }, 153 "data_preprocessing_documented": { 154 "applies": true, 155 "answer": true, 156 "justification": "Section 4.1 describes preprocessing: repository cloning, file scanning, indexing of pipeline components (source/operator/sink), SHA-256 checksums, and annotation with metadata. The query preprocessing (normalization, whitespace removal, entity recognition) is also described." 157 } 158 }, 159 "limitations_and_scope": { 160 "limitations_section_present": { 161 "applies": true, 162 "answer": false, 163 "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The conclusion mentions future work (schema evolution, self-healing) but does not discuss limitations of the current approach." 164 }, 165 "threats_to_validity_specific": { 166 "applies": true, 167 "answer": false, 168 "justification": "No specific threats to validity are discussed anywhere in the paper. There is no consideration of internal, external, or construct validity threats." 169 }, 170 "scope_boundaries_stated": { 171 "applies": true, 172 "answer": false, 173 "justification": "The paper does not explicitly state what its results do NOT show. It does not bound its claims to the tested LLMs, SPEs, or pipeline types. No explicit exclusions or non-claims are made." 174 } 175 }, 176 "data_integrity": { 177 "raw_data_available": { 178 "applies": true, 179 "answer": false, 180 "justification": "While the queries and benchmarks are said to be in the GitHub repository, the raw experimental data (individual pipeline outputs, error logs, timing data for each run) is not made available for independent verification." 181 }, 182 "data_collection_described": { 183 "applies": true, 184 "answer": true, 185 "justification": "Section 5.1.1 describes the benchmark creation: 8 diverse SP applications, balanced across 3 complexity levels, with 2 query types each (full and partial), generated 5 times per SPE. The pipeline applications are depicted in Figure 6." 
186 }, 187 "recruitment_methods_described": { 188 "applies": false, 189 "answer": false, 190 "justification": "No human participants are involved in this study. All evaluation is automated using benchmarks." 191 }, 192 "data_pipeline_documented": { 193 "applies": true, 194 "answer": true, 195 "justification": "The evaluation pipeline is documented: queries are submitted to each approach, pipelines are generated, then evaluated for syntax/logic/runtime errors, and EFS is computed using the formula in Equation 3. The error classification methodology (syntax, logic, runtime) is defined." 196 } 197 }, 198 "conflicts_of_interest": { 199 "funding_disclosed": { 200 "applies": true, 201 "answer": true, 202 "justification": "The Acknowledgement section states: 'This article is an output of a project supported by the Recovery and Resilience Plan of the Slovak Republic under the call Transformation and Innovation Consortia (project code: 09I02-03-V01-00012).'" 203 }, 204 "affiliations_disclosed": { 205 "applies": true, 206 "answer": true, 207 "justification": "Author affiliations are clearly stated: all three authors are from the Department of Computer Science, University of Innsbruck, Austria. No evaluated product belongs to the authors' institution." 208 }, 209 "funder_independent_of_outcome": { 210 "applies": true, 211 "answer": true, 212 "justification": "The funder (Slovak Republic Recovery and Resilience Plan, coordinated by InterWay a.s.) appears to be a government research funding body with no obvious financial stake in the specific results of this LLM pipeline generation research." 213 }, 214 "financial_interests_declared": { 215 "applies": true, 216 "answer": false, 217 "justification": "No competing interests or financial interests statement is present in the paper." 
218 } 219 }, 220 "contamination": { 221 "training_cutoff_stated": { 222 "applies": true, 223 "answer": false, 224 "justification": "The paper evaluates LLMs (GPT-4o-mini, Claude Haiku, Codestral Mamba, Llama-3.3) on pipeline generation tasks but does not state the training data cutoff dates for any of these models." 225 }, 226 "train_test_overlap_discussed": { 227 "applies": true, 228 "answer": false, 229 "justification": "The benchmarks include well-known pipeline patterns (e.g., word count) that may appear in LLM training data. The paper does not discuss whether the LLMs might have seen similar pipeline implementations during training." 230 }, 231 "benchmark_contamination_addressed": { 232 "applies": true, 233 "answer": false, 234 "justification": "The word count pipeline is a classic benchmark that has been widely published. Apache Flink/Storm/Spark examples are abundantly available online. No contamination analysis is performed despite the high likelihood that these models have seen similar pipeline code." 235 } 236 }, 237 "human_studies": { 238 "pre_registered": { 239 "applies": false, 240 "answer": false, 241 "justification": "No human participants are involved in this study." 242 }, 243 "irb_or_ethics_approval": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants are involved in this study." 247 }, 248 "demographics_reported": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants are involved in this study." 252 }, 253 "inclusion_exclusion_criteria": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants are involved in this study." 257 }, 258 "randomization_described": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants are involved in this study." 262 }, 263 "blinding_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants are involved in this study." 
267 }, 268 "attrition_reported": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants are involved in this study." 272 } 273 }, 274 "cost_and_practicality": { 275 "inference_cost_reported": { 276 "applies": true, 277 "answer": false, 278 "justification": "Table 3 lists API pricing per million tokens but the paper does not report the actual cost per pipeline generation. No total token consumption, API cost per query, or cost breakdown is provided despite using paid APIs." 279 }, 280 "compute_budget_stated": { 281 "applies": true, 282 "answer": false, 283 "justification": "The hardware is described (Intel i7-13700K, 64GB RAM) but no total computational budget, total API spend, or aggregate cost of running all experiments is stated." 284 } 285 } 286 }, 287 "claims": [ 288 { 289 "claim": "AutoStreamPipe reduces development time by 6.3x compared to manual coding for complex pipelines.", 290 "evidence": "Table 5 shows complex pipeline development: ASP takes 25-45 minutes vs manual 150-300 minutes. 300/45 ≈ 6.7x and 150/25 = 6x, supporting the ~6.3x claim.", 291 "supported": "moderate" 292 }, 293 { 294 "claim": "AutoStreamPipe reduces error rates by 5.19x compared to LLM code-generation methods.", 295 "evidence": "Table 4 and Section 5.3 show EFS comparisons. The 5.19x claim is stated in the abstract and conclusion but the exact derivation from the presented data is not explicitly shown. The improvement is consistent with comparing overall averages.", 296 "supported": "moderate" 297 }, 298 { 299 "claim": "AutoStreamPipe achieves near-perfect EFS of 0.98 for simple pipelines.", 300 "evidence": "Table 4 shows average EFS for simple pipelines across 3 SPEs: Flink=1.0, Storm=1.0, Spark=0.94, average=0.98.", 301 "supported": "strong" 302 }, 303 { 304 "claim": "AutoStreamPipe pipelines achieve 15-20% higher throughput and 30-35% lower latency than NiFi.", 305 "evidence": "Table 5 shows throughput and latency comparisons. 
For simple: ASP=150 vs NiFi=138 (8.7% higher). For medium: ASP=98 vs NiFi=81 (21% higher). For complex: ASP=72 vs NiFi=60 (20% higher). Latency improvements are consistent. The 15-20% throughput claim holds for medium and complex pipelines but not for simple pipelines, where the gain is only 8.7%.", 306 "supported": "moderate" 307 }, 308 { 309 "claim": "HGoT extends Graph of Thoughts by introducing hyperedges for multi-way dependencies.", 310 "evidence": "Section 4.3 provides formal definitions (Equations 1-2), algorithms (3-4), and Table 2 comparing reasoning frameworks. The theoretical contribution is clearly presented.", 311 "supported": "strong" 312 }, 313 { 314 "claim": "The system is capable of generating a virtually unlimited number of SP pipelines.", 315 "evidence": "Section 1 makes this claim but no evidence supports 'virtually unlimited' generation. Only 8 pipeline types are tested in evaluation.", 316 "supported": "unsupported" 317 } 318 ], 319 "methodology_tags": [ 320 "benchmark-eval", 321 "case-study" 322 ], 323 "key_findings": "AutoStreamPipe automates stream processing pipeline generation using LLMs with a novel Hypergraph of Thoughts (HGoT) reasoning framework that captures multi-way dependencies between pipeline components. The system achieves near-perfect error-free scores for simple pipelines (0.98 EFS) but performance drops for complex pipelines (0.59 EFS). Development time is reduced by up to 6.3x compared to manual coding, while maintaining throughput within 2-8% of manually optimized code. The multi-agent architecture with model rotation and resilient execution provides robustness against API failures.", 324 "red_flags": [ 325 { 326 "flag": "No limitations section", 327 "detail": "The paper has no limitations, threats-to-validity, or scope-bounding section. This is a significant omission for a systems paper making broad claims about automated pipeline generation." 
328 }, 329 { 330 "flag": "Baselines are only self-ablations", 331 "detail": "All baselines (Base-LLM, CoT Planning, GoT Based) are ablated versions of the proposed system. No independent competing approach for LLM-based pipeline generation is compared, making it impossible to assess relative performance against the state of the art." 332 }, 333 { 334 "flag": "No statistical rigor", 335 "detail": "Despite running each pipeline 5 times, no standard deviations, confidence intervals, or significance tests are reported for the main results; only the box plots in Figure 10 show any spread. Claims of improvement are based on raw averages without uncertainty quantification." 336 }, 337 { 338 "flag": "Contamination risk unaddressed", 339 "detail": "The benchmarks include classic patterns (word count, temperature monitoring) that are widely available online and likely in LLM training data. The paper does not discuss whether the LLMs may have memorized similar pipeline implementations, which could inflate all results including the baselines." 340 }, 341 { 342 "flag": "Unbounded generalization claims", 343 "detail": "The paper claims the system can generate 'a virtually unlimited number of SP pipelines' (Section 1) based on testing only 8 pipeline types. Claims extend far beyond the tested settings." 344 }, 345 { 346 "flag": "Missing inference costs", 347 "detail": "API prices are listed in Table 3 but actual costs per pipeline generation are never reported. For a system that makes multiple LLM calls across multiple providers, this omission makes practical applicability hard to assess." 348 }, 349 { 350 "flag": "Anonymized repository", 351 "detail": "The GitHub URL uses an anonymized account (Anonymous0-0paper), suggesting the repository may be temporary for review and not the final release." 352 } 353 ], 354 "cited_papers": [ 355 { 356 "title": "Graph of thoughts: Solving elaborate problems with large language models", 357 "authors": ["M. Besta", "N. Blach", "A. 
Kubicek"], 358 "year": 2024, 359 "relevance": "Core reasoning framework that HGoT extends; directly relevant to LLM reasoning and structured problem-solving." 360 }, 361 { 362 "title": "Chain-of-thought prompting elicits reasoning in large language models", 363 "authors": ["J. Wei", "X. Wang", "D. Schuurmans"], 364 "year": 2022, 365 "relevance": "Foundational LLM reasoning technique used as a baseline comparison in this work." 366 }, 367 { 368 "title": "Tree of thoughts: Deliberate problem solving with large language models", 369 "authors": ["S. Yao", "D. Yu", "J. Zhao"], 370 "year": 2023, 371 "relevance": "Alternative LLM reasoning framework compared in Table 2; relevant to structured LLM reasoning approaches." 372 }, 373 { 374 "title": "GPT-4 technical report", 375 "authors": ["J. Achiam", "S. Adler", "S. Agarwal"], 376 "year": 2023, 377 "arxiv_id": "2303.08774", 378 "relevance": "Foundation model referenced as key enabler of LLM-based pipeline generation." 379 }, 380 { 381 "title": "Code llama: Open foundation models for code", 382 "authors": ["B. Roziere", "J. Gehring", "F. Gloeckle"], 383 "year": 2023, 384 "arxiv_id": "2308.12950", 385 "relevance": "Code-specialized LLM relevant to automated code generation capabilities." 386 }, 387 { 388 "title": "Competition-level code generation with AlphaCode", 389 "authors": ["Y. Li", "D. Choi", "J. Chung"], 390 "year": 2022, 391 "relevance": "Demonstrates competitive LLM code generation; relevant to code generation evaluation methodology." 392 }, 393 { 394 "title": "A survey on code generation with LLM-based agents", 395 "authors": ["Y. Dong", "X. Jiang", "J. Qian"], 396 "year": 2025, 397 "relevance": "Recent survey on LLM-based code generation agents; directly relevant to understanding the state of the field." 398 }, 399 { 400 "title": "Tool learning with large language models: A survey", 401 "authors": ["C. Qu", "S. Dai", "X. 
Wei"], 402 "year": 2025, 403 "relevance": "Survey on LLM tool use relevant to multi-agent systems and agentic workflows." 404 }, 405 { 406 "title": "LLMs for science: Usage for code generation and data analysis", 407 "authors": ["M. Nejjar", "L. Zacharias", "F. Stiehle"], 408 "year": 2025, 409 "relevance": "Evaluates LLM code generation for scientific applications; relevant to LLM capability assessment." 410 }, 411 { 412 "title": "Starcoder: may the source be with you!", 413 "authors": ["R. Li", "L. B. Allal", "Y. Zi"], 414 "year": 2023, 415 "arxiv_id": "2305.06161", 416 "relevance": "Code-specialized foundation model trained on large code corpora; relevant to code generation benchmarking." 417 }, 418 { 419 "title": "AutoFlow: Automated workflow generation for large language model agents", 420 "authors": ["Z. Li", "S. Xu", "K. Mei"], 421 "year": 2024, 422 "relevance": "Related LLM-based workflow generation system; relevant comparison point for automated workflow generation." 423 }, 424 { 425 "title": "A survey on evaluation of large language models", 426 "authors": ["Y. Chang", "X. Wang", "J. Wang"], 427 "year": 2024, 428 "relevance": "Comprehensive survey on LLM evaluation methodology; relevant to understanding evaluation standards." 429 } 430 ] 431 }