scan-v5.json (25609B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Deployability-Centric Infrastructure-as-Code Generation: An LLM-based Iterative Framework", 6 "authors": [ 7 "Tianyi Zhang", 8 "Shidong Pan", 9 "Zejun Zhang", 10 "Zhenchang Xing", 11 "Xiaoyu Sun" 12 ], 13 "year": 2025, 14 "venue": "FSE (submitted)", 15 "arxiv_id": "2506.05623", 16 "doi": "10.48550/arXiv.2506.05623" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "All abstract claims are verifiable in the paper: the 20.8–30.2% first-attempt success rates match Table 2, 54.6–91.6% passItr@10 matches Table 2, >90% passItr@25 with human feedback matches Section 6.3, 25.2% intent coverage and 8.4% filtered compliance match Tables 4 and 5.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Causal claims about conversation history reducing error recurrence are supported by an ablation study comparing IaCGen with and without conversation history on Claude-3.5 (Fig. 
7), showing 15.9% reduction in required iterations.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": true, 35 "justification": "The paper scopes its main claims to AWS CloudFormation and explicitly notes in the threats section that highly specialized configurations may not be captured; Terraform generalizability is tested only with Claude-3.5 on syntax validation, which is clearly stated as a limitation.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper does not discuss alternative explanations for the main result that iterative feedback improves deployment success — for example, whether more LLM calls alone (without structured feedback) would produce similar gains.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper explicitly distinguishes syntactic correctness from deployability and argues deployability is the more meaningful measure; it separately reports policy-level compliance (75.3%) versus template-level compliance (8.4%), clearly distinguishing the two.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section 7.4 is a dedicated 'Threats to Validity' section covering multiple specific concerns about model versions, benchmark coverage, and language scope.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": true, 61 "justification": "Threats include specific statements such as limiting to 153 scenarios across 58 AWS services, the gap between CloudFormation and Terraform evaluation depth, and the time-bound nature of model evaluation at time of writing.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": true, 67 "justification": "The paper 
explicitly states it focuses on AWS CloudFormation (not other IaC tools), uses 153 benchmark scenarios, and that highly specialized configurations may not be captured.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No funding acknowledgment or disclosure appears anywhere in the provided paper text.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations are clearly stated on the title page: ANU, NYU/Columbia, NTU, CSIRO's Data61.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": false, 86 "answer": false, 87 "justification": "No funding is disclosed, so independence cannot be assessed.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests or financial interests declaration appears in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Key terms including IaC, IaC templates, resources, parameters, deployability, and the novel passItr@n metric are all explicitly defined in the paper.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "The contributions are explicitly enumerated: DPIaC-Eval benchmark, IaCGen framework, and empirical evidence about model performance across multiple quality dimensions.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 3.3 directly compares DPIaC-Eval to the prior IaC-Eval benchmark, and Section 8 situates the work relative to feedback mechanisms and LLM-based IaC generation literature, explaining how each prior approach falls short.", 114 "source": "haiku" 
115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": true, 124 "justification": "The replication package including the IaCGen code is available at https://github.com/Tianyi2/IaCGen, explicitly stated in the Data Availability section.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "The Data folder in the replication package contains the DPIaC-Eval benchmark, as stated in the Data Availability section.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper mentions tools used (boto3, yamllint, cfn-linter, Checkov) but provides no requirements.txt, Dockerfile, or equivalent environment specification; details are deferred to a README in the replication package without confirmation of completeness.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper defers reproduction details to the replication package README but provides no step-by-step instructions in the paper itself; the paper text only describes the workflow at a high level.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "All results in Tables 2, 4, and 5 are reported as single percentage values with no confidence intervals or error bars.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are applied to comparative claims (e.g., Claude-3.5 91.6% vs GPT-4o 54.6% at passItr@10).", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Effect sizes are reported as percentage improvements (e.g., 'near 
200% performance improvement' from passItr@1 to passItr@15, 15.9% reduction in iterations with conversation history).", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "The benchmark size of 153 scenarios is described by its construction process but not statistically justified; no power analysis or sample size rationale is provided.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "All results are reported as single point estimates; no variance, standard deviation, or confidence intervals across multiple runs are reported.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Baselines include passItr@1 performance (first attempt, without iterative feedback) and a conversation-history ablation comparing IaCGen against a variant that receives only the latest error without history.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "All six evaluated models (GPT-4o, GPT-o3-mini, Claude-3.5, Claude-3.7, DeepSeek-R1, DeepSeek-V3) are current state-of-the-art models.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "An ablation study comparing IaCGen with and without complete conversation history is conducted using Claude-3.5 (Fig. 
7), showing the contribution of the conversation history component.", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "The paper uses passItr@n for deployability, resource/attribute-level intent matching, and three security compliance metrics (policy pass rate, unfiltered compliance, filtered compliance).", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": true, 205 "answer": true, 206 "justification": "Human-in-the-loop feedback from a cloud engineer is evaluated in RQ3, and a DevOps expert manually crafted intent specifications for 51 benchmark samples for user intent matching evaluation.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "DPIaC-Eval serves as the held-out test set; LLMs are not fine-tuned on any portion of it and are evaluated zero-shot.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Results are broken down by difficulty level (Fig. 4), error stage (Fig. 
8), error type (Table 3), and per-model performance across all metrics (Tables 2, 4, 5).", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Section 6.2 analyzes five specific error categories (Missing Value, Self-defined Property, Null Substitution, Unnecessary Whitespace, Arbitrary Default Value) with per-model failure counts and root cause analysis.", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "The paper honestly reports negative findings: only 8.4% filtered security compliance, only 25.2% user intent satisfaction, GPT-4o's substantially lower performance (55.2% vs Claude's 95.5% at passItr@15).", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": false, 238 "justification": "Model names such as 'Claude-3.5', 'Claude-3.7', 'GPT-4o', 'GPT-o3-mini' are used without specifying exact version identifiers or snapshot dates; the paper only promises these details are in the replication package.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": false, 244 "justification": "Full prompts are not included in the paper; the system prompt structure is described but actual prompt text is deferred to the code repository.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Temperature is set to 0 and maximum output token limit of 8,000 is explicitly stated for all model evaluations.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": true, 255 "answer": true, 256 "justification": "The IaCGen framework is described in detail in Section 4, including the three validation stages (format verification, syntax checking, live deployment) and the feedback allocation strategy (2 general + 4 detailed attempts per 
stage).", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "The benchmark construction pipeline is documented with specific filtering steps and template counts at each stage: 900→850 (size filtering)→465 (syntax check)→200 (deployment test)→153 (rectification), shown in Fig. 2.", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": true, 270 "justification": "The DPIaC-Eval benchmark (153 templates and prompts) is available in the replication package's Data folder.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "Section 3.1 describes template sources (AWS documentation, AWS Samples GitHub, GitHub repositories using CloudFormation), ethical licensing checks (MIT, Apache 2.0), and the multi-stage preprocessing pipeline.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants were recruited as study subjects; DevOps practitioners were used for benchmark construction but not as experimental participants.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "The complete data pipeline from collection to final benchmark is documented in Section 3.1 and illustrated in Fig. 
2, including filtering criteria and counts at each stage.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "Training data cutoffs for the six LLMs are not stated in the paper; the paper only mentions these will be documented in the replication package.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": false, 302 "justification": "The DPIaC-Eval templates were sourced from publicly available GitHub repositories and AWS documentation that predate the LLMs' training cutoffs; potential overlap is never discussed.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": false, 308 "justification": "The benchmark templates are from public GitHub repositories and AWS sample libraries that were almost certainly available before the LLMs' training cutoffs; the paper does not address this contamination risk.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants as experimental subjects; DevOps practitioners were used only for benchmark construction.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human subjects research; ethics mentions relate only to IP licensing of templates.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants as experimental subjects.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants as experimental subjects.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human 
participants as experimental subjects.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants as experimental subjects.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants as experimental subjects.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": true, 360 "justification": "Per-template costs are reported: Claude-3.7-Sonnet $0.42 (most expensive), DeepSeek-V3 $0.04 (cheapest), AWS deployment $0.04 per deployable template.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": true, 366 "justification": "Total study costs are explicitly stated: $230.75 for LLM API tokens and $35.21 for AWS deployment validation.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "Six state-of-the-art LLMs achieve only 20.8–30.2% deployment success rate on the first attempt at IaC template generation.", 375 "evidence": "Table 2 shows passItr@1 results: GPT-4o 22.7%, GPT-o3-mini 20.8%, Claude-3.5 30.2%, Claude-3.7 26.8%, DeepSeek-R1 22.9%, DeepSeek-V3 24.2%.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "IaCGen achieves 54.6–91.6% deployment success in 10 iterations across all evaluated models.", 380 "evidence": "Table 2 shows passItr@10: GPT-4o 54.6%, GPT-o3-mini 66.2%, Claude-3.5 91.6%, Claude-3.7 86.9%, DeepSeek-R1 68.0%, DeepSeek-V3 56.9%.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Maintaining complete conversation history reduces required iterations by 15.9% compared to providing only the most recent error.", 385 "evidence": "Ablation study (Fig. 7) on Claude-3.5 shows IaCGen averages 4.55 iterations vs. 
baseline's 5.41 iterations to achieve deployable templates.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Human-in-the-loop feedback enables all six models to exceed 90% passItr@25.", 390 "evidence": "Section 6.3 and Fig. 9 show all models surpass 90% passItr@25 with human feedback; Claude models reach 98%.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "Only 25.2% of generated IaC templates fully satisfy user intent at both resource and attribute level.", 395 "evidence": "Table 4 shows average resource-level matching of 58.8%, attribute-level 40.5%, and combined Resource & Attribute only 25.2% across all models.", 396 "supported": "moderate" 397 }, 398 { 399 "claim": "Only 8.4% of generated deployable templates achieve full security compliance when filtered for applicable policies.", 400 "evidence": "Table 5 shows filtered compliance rates ranging from 6.1% (GPT-4o) to 11.5% (DeepSeek-V3), averaging 8.4%.", 401 "supported": "moderate" 402 }, 403 { 404 "claim": "IaCGen generalizes to Terraform, achieving 100% passItr@7 syntax accuracy with Claude-3.5 on IaC-Eval benchmark.", 405 "evidence": "Section 6.1 reports 79.7% passItr@1 and 100% passItr@7 on IaC-Eval Terraform benchmark with an average of 1.58 iterations.", 406 "supported": "weak" 407 } 408 ], 409 "methodology_tags": [ 410 "benchmark-eval" 411 ], 412 "key_findings": "Current LLMs are poor at generating deployable AWS CloudFormation templates with only 20.8–30.2% first-attempt success, despite reasonable syntactic correctness. The IaCGen iterative feedback framework dramatically improves this to 54.6–91.6% within 10 iterations by simulating real DevOps workflows with progressive validation stages. Security compliance of generated templates is alarmingly low at 8.4% filtered compliance, and user intent matching is weak at 25.2% combined resource-and-attribute satisfaction, indicating that deployability is necessary but far from sufficient for practical utility. 
Maintaining complete conversation history is more effective than isolated-feedback approaches, as it prevents 'Error Recurrence' where LLMs reintroduce previously corrected mistakes.", 413 "red_flags": [ 414 { 415 "flag": "No statistical significance testing", 416 "detail": "All comparative claims between models and conditions rest on raw percentage differences; no statistical tests are used to establish that those differences are significant." 417 }, 418 { 419 "flag": "Benchmark contamination unaddressed", 420 "detail": "DPIaC-Eval templates were sourced from public GitHub repositories and AWS documentation that predate the LLMs' training cutoffs; potential memorization of test templates is never discussed." 421 }, 422 { 423 "flag": "User intent evaluation on 51/153 samples", 424 "detail": "The intent matching evaluation (RQ4) uses only 51 randomly sampled instances from the 153-template benchmark, reducing statistical power for this important finding." 425 }, 426 { 427 "flag": "Vague model version identifiers", 428 "detail": "Model names like 'Claude-3.5' and 'Claude-3.7' are not fully specified in the paper; exact version/snapshot identifiers are deferred to the replication package only." 429 }, 430 { 431 "flag": "Single-run results, no variance", 432 "detail": "All results appear to be from single evaluation runs with no variance reported across runs; temperature=0 is used, but this only partially addresses stochasticity." 433 }, 434 { 435 "flag": "Terraform generalizability underpowered", 436 "detail": "Terraform generalizability is tested with only Claude-3.5 and only measures syntax correctness (not deployability), making the generalizability claim much weaker than presented." 437 } 438 ], 439 "cited_papers": [ 440 { 441 "title": "IaC-Eval: A Code Generation Benchmark for Cloud Infrastructure-as-Code Programs", 442 "relevance": "Primary prior benchmark for LLM IaC generation; DPIaC-Eval is directly compared against this work and extends it." 
443 }, 444 { 445 "title": "Evaluating Large Language Models Trained on Code (HumanEval)", 446 "relevance": "Standard code generation benchmark used as reference point; IaC success rates (19–30%) are contrasted with HumanEval rates (~95%)." 447 }, 448 { 449 "title": "Teaching Large Language Models to Self-Debug", 450 "relevance": "Related feedback mechanism approach for code generation; IaCGen extends this concept to IaC with multi-stage deployment feedback." 451 }, 452 { 453 "title": "Self-Refine: Iterative Refinement with Self-Feedback", 454 "relevance": "Foundational iterative refinement approach that IaCGen builds upon, extending to deployment-validated IaC generation." 455 }, 456 { 457 "title": "Using a Feedback Loop for LLM-based Infrastructure as Code Generation", 458 "relevance": "Most closely related prior work; IaCGen improves upon it by preserving conversation history and including live deployment validation." 459 }, 460 { 461 "title": "RepoCoder: Repository-Level Code Completion through Iterative Retrieval and Generation", 462 "relevance": "Related iterative feedback approach for code generation that only provides immediate error messages, contrasted with IaCGen's full conversation history approach." 463 } 464 ], 465 "engagement_factors": { 466 "practical_relevance": { 467 "score": 3, 468 "justification": "Directly addresses a pain point for DevOps practitioners — automating CloudFormation template generation with a working framework and public replication package." 469 }, 470 "surprise_contrarian": { 471 "score": 2, 472 "justification": "The finding that syntactic correctness is nearly useless as an IaC quality metric (42.7% of syntactically valid templates fail deployment) challenges how the field has been evaluating LLMs for IaC." 473 }, 474 "fear_safety": { 475 "score": 1, 476 "justification": "The 8.4% security compliance finding is concerning for cloud security practitioners but is framed as a research gap rather than an imminent risk." 
477 }, 478 "drama_conflict": { 479 "score": 1, 480 "justification": "The Claude vs GPT comparison shows a dramatic performance difference (95.5% vs 55.2% passItr@15) that practitioners will notice, but the framing is academic rather than dramatic." 481 }, 482 "demo_ability": { 483 "score": 2, 484 "justification": "Code is publicly available on GitHub and the framework can be run against the DPIaC-Eval benchmark, though it requires AWS account setup and API keys." 485 }, 486 "brand_recognition": { 487 "score": 1, 488 "justification": "Authors are from ANU, NTU, and CSIRO — established institutions but not AI lab brand names; the paper is submitted to FSE, a respected software engineering venue rather than a top-tier AI conference." 489 } 490 }, 491 "hn_data": { 492 "threads": [], 493 "top_points": 0, 494 "total_points": 0, 495 "total_comments": 0 496 } 497 }