scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32821B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Deployability-Centric Infrastructure-as-Code Generation: An LLM-based Iterative Framework",
      6     "authors": [
      7       "Tianyi Zhang",
      8       "Shidong Pan",
      9       "Zejun Zhang",
     10       "Zhenchang Xing",
     11       "Xiaoyu Sun"
     12     ],
     13     "year": 2025,
     14     "venue": "FSE 2025 (to appear)",
     15     "arxiv_id": "2506.05623",
     16     "doi": "10.48550/arXiv.2506.05623"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims match results: 20.8–30.2% first-attempt rates (Table 2 passItr@1), 54.6–91.6% at 10 iterations (Table 2 passItr@10), >90% passItr@25 with human feedback (Fig. 9), 25.2% user requirement coverage (Table 4), 8.4% security compliance (Table 5).",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The main causal claim (IaCGen's iterative feedback improves deployment success) is supported by controlled ablation: with vs. without conversation history (Fig. 7), and decomposition by feedback level (Fig. 6). These are adequate single-variable manipulations.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Title ('Infrastructure-as-Code Generation') and abstract frame claims broadly, but the study is primarily CloudFormation-specific with only a small Terraform syntax-only extension. Section 7.4 acknowledges this but the title and abstract overclaim the generality.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Section 7.4 discusses threats (model evolution, benchmark scope, difficulty categorization) but these are generalizability concerns, not alternative explanations for the observed results. For example, whether similar improvement could come from simple retrying without structured feedback, or from prompt-only approaches, is not discussed.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly acknowledges that deployment success (the primary metric) does not capture full IaC quality, and supplements it with user intent matching (Section 6.4) and security compliance analysis. This distinction between deployment success and overall template utility is clearly stated.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 7.4 'Threats to Validity' provides a substantive discussion of limitations across multiple paragraphs.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section 7.4 identifies threats specific to this study: models available at time of writing may yield different results, 153 scenarios may not capture 'highly specialized configurations', difficulty categorization 'may not align perfectly with all organizational perspectives', and absence of standardized benchmarks across IaC languages.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Section 7.4 explicitly states boundaries: focus on CloudFormation, 153 scenarios across 58 services, a specific model set, and the acknowledged lack of standardized benchmarks across IaC languages. Section 2.2 also explicitly explains the focus on CloudFormation over Terraform for methodological reasons.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding source, grants, or acknowledgments section is present in the paper.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All author affiliations are clearly listed: Australian National University, NYU & Columbia University, Nanyang Technological University, and CSIRO's Data61. None are affiliated with the companies whose LLMs are evaluated.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Cannot be determined since no funding source is disclosed. Authors are at academic institutions not affiliated with evaluated LLM providers, suggesting likely independence, but no explicit disclosure.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is present in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are defined precisely in Section 2: IaC, resources, parameters, resource properties; passItr@n is formally defined in Section 5 and contrasted with pass@k; deployability is defined as live deployment success throughout.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three explicit contributions are bulleted in Section 1: DPIaC-Eval benchmark, IaCGen framework, and empirical evidence about model performance across multiple dimensions of IaC quality.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 3.3 directly compares DPIaC-Eval to IaC-Eval on concrete dimensions (template length, service count, evaluation dimensions); Section 8 situates the feedback mechanism relative to prior LLM feedback work and explains how IaCGen addresses limitations of Palavalli et al. and Ragothaman.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Replication package released at https://github.com/Tianyi2/IaCGen (Section 9, Data Availability): 'The Code folder contains the code of our IaCGen framework.'",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The replication package includes the benchmark: 'The Data folder contains the benchmarks' (Section 9, Data Availability).",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper states temperature=0 and 8000 max tokens, and mentions yamllint, cfn-lint, boto3, but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": true,
    142           "justification": "The replication package contains READMEs: 'Detailed descriptions of files can be found in the README.md file within each folder' (Section 9). The repository URL is provided.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Tables 2, 4, and 5 are reported as point estimates with no confidence intervals, error bars, or ± notation.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "Claims of model differences (e.g., 'Claude models show impressive performance') are based on comparing raw percentages without any statistical significance tests.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Effect sizes are reported with baseline context throughout: '≈24.7% passItr@1 to ≈74.8% passItr@15, a near 200% performance improvement' (Section 6.1), '15.9% reduction in required iterations' (Section 6.1), and percentage point gains for each feedback level.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "No justification is given for the final benchmark size of 153 scenarios. The 51-instance subsample for user intent evaluation is described as 'randomly sampled' but no power analysis or sample size justification is provided.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "All experiments use temperature=0 for deterministic output and appear to be single-run. No variance, standard deviation, or spread measures are reported across experimental runs.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Multiple baselines: passItr@1 (no iteration), IaCGen without conversation history (Fig. 7), general feedback only vs. detailed feedback (Fig. 6), and comparison with IaC-Eval benchmark (Section 6.1).",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "All six evaluated models are contemporary: GPT-4o, GPT-o3-mini, Claude-3.5, Claude-3.7, DeepSeek-R1, and DeepSeek-V3, representing state-of-the-art at time of writing.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Ablation on conversation history (Fig. 7, Section 6.1): IaCGen with vs. without full conversation history using Claude-3.5. Also, decomposition of feedback levels (no feedback / general / detailed) in Fig. 6.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Multiple metrics: passItr@n at various n (deployment success), user intent matching at resource and attribute levels (Table 4), three security compliance metrics (Table 5), and error stage distribution (Fig. 8).",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "All evaluation of system outputs is automated: deployment pass/fail via boto3, user intent matching via Checkov custom policies, security via Checkov. The human-in-the-loop component (RQ3) provides feedback during generation, not evaluation of outputs.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": false,
    212           "justification": "All 153 benchmark scenarios are used for evaluation with no explicit train/dev/test separation. The framework design (feedback structure, iteration limits) was developed with knowledge of the benchmark, though iteration limits are justified by prior work [22].",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Extensive breakdowns: by difficulty level (Fig. 4), by model (Tables 2, 4, 5), by error stage (Fig. 8), by error type (Table 3), and by feedback level (Fig. 6).",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "RQ2 (Section 6.2) provides detailed failure analysis: five primary error categories (Missing Value, Self-defined Property, Null Substitution, Unnecessary Whitespace, Arbitrary Default Value) with counts per model and explanations.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Several negative results reported: poor initial pass rates (≈24.7%), poor user intent alignment (25.2% combined coverage), very low security compliance (8.4%), and GPT-4o's poor deployment error resolution.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Models are identified only by marketing names: 'GPT-4o, GPT-o3-mini, Claude-3.5, Claude-3.7, DeepSeek-R1, and DeepSeek-V3' (Section 5). No API version strings or snapshot dates. Section 7.4 notes 'we document the specific model and their cut-off date in our replication package' but the paper itself does not specify versions.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "The paper describes prompt structure ('We need a CloudFormation template that creates [description]...') and states 'The complete prompts and other reproducible details mentioned in this section are available in our code repository' (Section 4.1) at https://github.com/Tianyi2/IaCGen.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Section 5: 'All models are evaluated with consistent temperature settings of 0 to maximize deterministic outputs and configured with the 8,000 maximum output token limit.' Iteration limits (2 general + 4 detailed feedback) also specified.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "IaCGen framework described in detail in Section 4 with workflow diagram (Fig. 5), three-stage validation pipeline, conversation history management, feedback mechanism design, and iteration logic.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 3.1 documents the full preprocessing pipeline with counts at each stage: ~900 templates → 850 (size filtering) → 465 (syntax validation) → 200 (deployment testing) → 153 (rectification). Filtering criteria at each stage are described.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "The replication package at https://github.com/Tianyi2/IaCGen contains the benchmark data and framework code (Section 9, Data Availability).",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 3.1 describes template sourcing from AWS documentation, AWS Samples GitHub, and a CloudFormation repository dataset [38], with ethical licensing checks and multi-stage preprocessing.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": false,
    282           "justification": "The paper mentions 'one DevOps practitioner with three years of AWS and CloudFormation experience' and 'two additional DevOps practitioners with equivalent expertise' for benchmark construction, but does not describe how these practitioners were recruited or selected.",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Full pipeline documented in Section 3.1: ~900 initial templates → 850 after size filtering → 465 after syntax validation → 200 after deployment testing → 153 after rectification. Each stage's criteria are described.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No model training cutoff dates are stated in the paper. Section 7.4 mentions 'we document the specific model and their cut-off date in our replication package' but this information is not in the paper itself.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "Templates are sourced from public AWS documentation and GitHub repositories, which are likely in LLM training data. No discussion of potential overlap between these public templates and model training data.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "The benchmark templates originate from public AWS CloudFormation sample templates [36] and GitHub repositories [37, 38], which are highly likely to appear in LLM training corpora. This contamination risk is not discussed.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in the study. The DevOps practitioners involved in benchmark construction and the cloud engineer providing feedback are part of the methodology design, not study participants.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants study. This is a benchmark evaluation of LLMs.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants study.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants study.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants study.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants study.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants study.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Section 5 reports total API cost ($230.75), per-model costs (Claude-3.7 at $0.42/template, DeepSeek-V3 at $0.04/template), and deployment cost ($0.04/template). Total minimum cost of $0.08 per deployable template is stated.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Section 5 states total LLM API cost ($230.75) and AWS deployment cost ($35.21), totaling $265.96 for the full evaluation. Per-template costs are broken down by model.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "Temperature is set to 0 for deterministic output, but no multi-run analysis is performed to verify determinism across API calls or assess sensitivity to non-deterministic factors.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The paper implies single runs via temperature=0 but never explicitly states the number of experimental runs per configuration.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "No hyperparameter search is described. The iteration limit allocation (2 general + 4 detailed) is justified by prior work [22] but no systematic search over framework parameters is reported.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": false,
    392           "justification": "Framework configuration (feedback structure, iteration limits) is fixed based on prior work [22] but no systematic comparison of alternative configurations is provided to justify that the chosen design is optimal.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "The paper compares 6 models across multiple metrics and difficulty levels, making numerous implicit comparisons, but performs no statistical tests and therefore no multiple comparison correction.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The authors developed both the DPIaC-Eval benchmark and IaCGen framework, then evaluate the framework on their own benchmark. No acknowledgment of author-evaluation bias or independent evaluation.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "Per-model API costs are reported (Section 5), but performance is not analyzed as a function of compute budget. Claude-3.7 ($0.42/template) vs. DeepSeek-V3 ($0.04/template) is a roughly 10x cost difference, but the cost-performance tradeoff is not discussed.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": true,
    416           "justification": "The paper explicitly addresses construct validity by showing that neither syntactic correctness nor deployment success alone is sufficient: 42.7% of syntactically correct templates fail deployment (Section 3.3), and RQ4 demonstrates that deployable templates rarely satisfy user intent (only 25.2% do) or pass security compliance checks (only 8.4% do), motivating multi-dimensional evaluation.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": true,
    421           "answer": true,
    422           "justification": "The same IaCGen framework is used consistently across all 6 model comparisons, controlling for scaffold differences. Model comparisons are within the same scaffold.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "Benchmark templates are sourced from public AWS documentation and GitHub repositories published before all evaluated models' training. No discussion of whether models may have seen these templates or similar ones during training.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "No discussion of whether the natural language prompts or error messages in the iterative loop provide information that constitutes leakage beyond what would be available in real-world usage.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "Multiple benchmark templates share AWS service families and structural patterns. No discussion of whether templates from the same service are structurally similar enough to inflate performance estimates.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "No concrete leakage detection methods (canary strings, membership inference, n-gram overlap analysis, decontamination) are applied despite the benchmark being sourced from publicly available templates.",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "State-of-the-art LLMs achieve only 20.8-30.2% deployment success rate on first attempt for CloudFormation IaC generation",
    457       "evidence": "Table 2 passItr@1: GPT-4o 22.7%, GPT-o3-mini 20.8%, Claude-3.5 30.2%, Claude-3.7 26.8%, DeepSeek-R1 22.9%, DeepSeek-V3 24.2%",
    458       "supported": "strong"
    459     },
    460     {
    461       "claim": "IaCGen improves deployment success to 54.6-91.6% within 10 iterations across all six models",
    462       "evidence": "Table 2 passItr@10 column shows the range; Claude-3.5 achieves 91.6%, GPT-4o achieves 54.6% as the weakest",
    463       "supported": "strong"
    464     },
    465     {
    466       "claim": "Human-in-the-loop feedback enables all six LLMs to exceed 90% passItr@25",
    467       "evidence": "Fig. 9 and Section 6.3 show all models cross 90% with human feedback; Claude models reach 98%; improvements range from 2.6 to 37.9 percentage points",
    468       "supported": "strong"
    469     },
    470     {
    471       "claim": "Maintaining full conversation history reduces average iterations to deployment by 15.9% versus immediate-context-only baseline",
    472       "evidence": "Ablation study on Claude-3.5 (Fig. 7): IaCGen averages 4.55 iterations vs baseline 5.41 iterations; limited to one model",
    473       "supported": "moderate"
    474     },
    475     {
    476       "claim": "Only 25.2% of LLM-generated IaC templates satisfy both resource-level and attribute-level user intent",
    477       "evidence": "Table 4: average across models is 58.8% resource-level, 40.5% attribute-level, 25.2% combined; evaluated on only 51 of 153 templates",
    478       "supported": "moderate"
    479     },
    480     {
    481       "claim": "Security compliance of LLM-generated deployable templates is critically low at 8.4% (filtered compliance rate)",
    482       "evidence": "Table 5: filtered compliance ranges from 6.1% (GPT-4o) to 11.5% (DeepSeek-V3), averaging 8.4%; 70 distinct policy violations identified across all templates",
    483       "supported": "strong"
    484     },
    485     {
    486       "claim": "IaCGen generalizes to Terraform, achieving 100% passItr@7 on IaC-Eval benchmark with Claude-3.5",
    487       "evidence": "Section 6.1 reports 79.7% passItr@1 and 100% passItr@7 on Terraform IaC-Eval; requires only 1.58 average iterations vs 2.53 for CloudFormation DPIaC-Eval",
    488       "supported": "moderate"
    489     }
    490   ],
    491   "methodology_tags": [
    492     "benchmark-eval",
    493     "case-study"
    494   ],
    495   "key_findings": "Current LLMs are poor at generating deployable AWS CloudFormation templates (20.8-30.2% deployment success on first attempt), far below their general code generation performance (~95% on HumanEval). The IaCGen framework's multi-stage iterative feedback mechanism substantially improves this to 54.6-91.6% in 10 iterations, with Claude models exceeding 90% at 15 iterations without human intervention. Despite deployment success, LLM-generated IaC templates remain fundamentally untrustworthy: only 25.2% satisfy user intent at both resource and attribute levels, and only 8.4% achieve full security compliance—revealing that deployability is necessary but far from sufficient for production-ready infrastructure code.",
    496   "red_flags": [
    497     {
    498       "flag": "No statistical tests on model comparisons",
    499       "detail": "Comparative claims between six models are made without significance tests or confidence intervals; large performance gaps (e.g., Claude-3.5 91.6% vs GPT-4o 54.6% at passItr@10) are presented as factual without accounting for sampling uncertainty across only 153 templates."
    500     },
    501     {
    502       "flag": "Benchmark contamination unaddressed",
    503       "detail": "DPIaC-Eval uses CloudFormation templates from public GitHub repositories (aws-cloudformation-templates, AWS Samples) and AWS documentation that predate LLM training cutoffs; no discussion of whether models may have seen these templates, which would inflate performance estimates."
    504     },
    505     {
    506       "flag": "Single-model ablation",
    507       "detail": "The conversation history ablation is conducted only on Claude-3.5; whether the 15.9% iteration reduction generalizes to GPT-4o, DeepSeek-V3, or other architectures is untested."
    508     },
    509     {
    510       "flag": "Partial intent evaluation sample",
    511       "detail": "User intent matching is evaluated on only 51 of 153 benchmark templates (randomly sampled), and requires manual expert annotation of intent specifications; the sampled subset may not be representative."
    512     },
    513     {
    514       "flag": "No funding disclosure",
    515       "detail": "No funding source or competing interests statement appears anywhere in the paper; it is therefore unknown whether industry funding (e.g., AWS, Anthropic, OpenAI) creates conflicts of interest, given that the paper compares these vendors' models."
    516     },
    517     {
    518       "flag": "AWS account dependency for reproduction",
    519       "detail": "Reproducing results requires a live AWS sandbox account with specific least-privilege IAM configuration and incurs real AWS costs; this is a substantial undocumented barrier to independent verification."
    520     }
    521   ],
    522   "cited_papers": [
    523     {
    524       "title": "IaC-Eval: A code generation benchmark for cloud infrastructure-as-code programs",
    525       "relevance": "Primary prior benchmark for LLM IaC generation; DPIaC-Eval directly builds on and extends this work by adding deployability evaluation beyond syntax correctness"
    526     },
    527     {
    528       "title": "Evaluating large language models trained on code (Codex/HumanEval)",
    529       "relevance": "Foundational code generation benchmark establishing pass@k metric used as comparison baseline for general programming task performance"
    530     },
    531     {
    532       "title": "Using a feedback loop for LLM-based infrastructure as code generation",
    533       "relevance": "Most directly related prior work on IaC feedback loops; IaCGen addresses its limitations of syntax-only evaluation and incomplete feedback strategy"
    534     },
    535     {
    536       "title": "Teaching large language models to self-debug",
    537       "relevance": "Related work on feedback-driven LLM code refinement that IaCGen extends to the IaC domain"
    538     },
    539     {
    540       "title": "Self-refine: Iterative refinement with self-feedback",
    541       "relevance": "Related iterative self-improvement framework whose design principles inform IaCGen's feedback architecture"
    542     },
    543     {
    544       "title": "A survey of using large language models for generating infrastructure as code",
    545       "relevance": "Contextualizes the IaC generation research landscape and motivates the deployability focus"
    546     },
    547     {
    548       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    549       "relevance": "Related benchmark evaluation methodology comparing LLM code generation quality beyond surface metrics"
    550     }
    551   ],
    552   "engagement_factors": {
    553     "practical_relevance": {
    554       "score": 2,
    555       "justification": "IaCGen addresses a real DevOps pain point and the replication package is released, but it's a research framework rather than a production-ready tool."
    556     },
    557     "surprise_contrarian": {
    558       "score": 1,
    559       "justification": "The finding that iterative feedback helps is expected; the mildly surprising element is how poorly LLMs perform on first-attempt deployment (20–30%) despite strong general code generation abilities."
    560     },
    561     "fear_safety": {
    562       "score": 1,
    563       "justification": "The 8.4% security compliance rate raises concerns about deploying LLM-generated infrastructure, but this is an expected finding rather than a novel attack or existential concern."
    564     },
    565     "drama_conflict": {
    566       "score": 0,
    567       "justification": "No controversy or confrontational claims; the paper is a straightforward benchmark and framework evaluation."
    568     },
    569     "demo_ability": {
    570       "score": 2,
    571       "justification": "Code and benchmark are released on GitHub, so a practitioner could run the framework, though doing so requires AWS credentials and API keys."
    572     },
    573     "brand_recognition": {
    574       "score": 1,
    575       "justification": "Evaluates well-known models (GPT-4o, Claude, DeepSeek) but the paper itself is from academic institutions without major brand recognition."
    576     }
    577   },
    578   "hn_data": {
    579     "threads": [],
    580     "top_points": 0,
    581     "total_points": 0,
    582     "total_comments": 0
    583   }
    584 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs