scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25609B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Deployability-Centric Infrastructure-as-Code Generation: An LLM-based Iterative Framework",
      6     "authors": [
      7       "Tianyi Zhang",
      8       "Shidong Pan",
      9       "Zejun Zhang",
     10       "Zhenchang Xing",
     11       "Xiaoyu Sun"
     12     ],
     13     "year": 2025,
     14     "venue": "FSE (submitted)",
     15     "arxiv_id": "2506.05623",
     16     "doi": "10.48550/arXiv.2506.05623"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All abstract claims are verifiable in the paper: the 20.8–30.2% first-attempt success rates match Table 2, 54.6–91.6% passItr@10 matches Table 2, >90% passItr@25 with human feedback matches Section 6.3, 25.2% intent coverage and 8.4% filtered compliance match Tables 4 and 5.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims about conversation history reducing error recurrence are supported by an ablation study comparing IaCGen with and without conversation history on Claude-3.5 (Fig. 7), showing a 15.9% reduction in required iterations.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The paper scopes its main claims to AWS CloudFormation and explicitly notes in the threats section that highly specialized configurations may not be captured; Terraform generalizability is tested only with Claude-3.5 on syntax validation, which is clearly stated as a limitation.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss alternative explanations for the main result that iterative feedback improves deployment success — for example, whether more LLM calls alone (without structured feedback) would produce similar gains.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly distinguishes syntactic correctness from deployability and argues deployability is the more meaningful measure; it separately reports policy-level compliance (75.3%) versus template-level compliance (8.4%), clearly distinguishing the two.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 7.4 is a dedicated 'Threats to Validity' section covering multiple specific concerns about model versions, benchmark coverage, and language scope.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Threats include specific statements such as limiting to 153 scenarios across 58 AWS services, the gap between CloudFormation and Terraform evaluation depth, and the time-bound nature of model evaluation at time of writing.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly states it focuses on AWS CloudFormation (not other IaC tools), uses 153 benchmark scenarios, and that highly specialized configurations may not be captured.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment or disclosure appears anywhere in the provided paper text.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly stated on the title page: ANU, NYU/Columbia, NTU, CSIRO's Data61.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding is disclosed, so independence cannot be assessed.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests declaration appears in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms including IaC, IaC templates, resources, parameters, deployability, and the novel passItr@n metric are all explicitly defined in the paper.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The contributions are explicitly enumerated: DPIaC-Eval benchmark, IaCGen framework, and empirical evidence about model performance across multiple quality dimensions.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 3.3 directly compares DPIaC-Eval to the prior IaC-Eval benchmark, and Section 8 situates the work relative to feedback mechanisms and LLM-based IaC generation literature, explaining how each prior approach falls short.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The replication package including the IaCGen code is available at https://github.com/Tianyi2/IaCGen, explicitly stated in the Data Availability section.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The Data folder in the replication package contains the DPIaC-Eval benchmark, as stated in the Data Availability section.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper mentions tools used (boto3, yamllint, cfn-linter, Checkov) but provides no requirements.txt, Dockerfile, or equivalent environment specification; details are deferred to a README in the replication package without confirmation of completeness.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The paper defers reproduction details to the replication package README but provides no step-by-step instructions in the paper itself; the paper text only describes the workflow at a high level.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Tables 2, 4, and 5 are reported as single percentage values with no confidence intervals or error bars.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied to comparative claims (e.g., Claude-3.5 91.6% vs GPT-4o 54.6% at passItr@10).",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Effect sizes are reported as percentage improvements (e.g., 'near 200% performance improvement' from passItr@1 to passItr@15, 15.9% reduction in iterations with conversation history).",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The benchmark size of 153 scenarios is described by its construction process but not statistically justified; no power analysis or sample size rationale is provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "All results are reported as single point estimates; no variance, standard deviation, or confidence intervals across multiple runs are reported.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Baselines include pass@1 performance without iterative feedback and a conversation-history ablation comparing IaCGen to providing only the latest error without history.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "All six evaluated models (GPT-4o, GPT-o3-mini, Claude-3.5, Claude-3.7, DeepSeek-R1, DeepSeek-V3) are current state-of-the-art models.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "An ablation study comparing IaCGen with and without complete conversation history is conducted using Claude-3.5 (Fig. 7), showing the contribution of the conversation history component.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "The paper uses passItr@n for deployability, resource/attribute-level intent matching, and three security compliance metrics (policy pass rate, unfiltered compliance, filtered compliance).",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Human-in-the-loop feedback from a cloud engineer is evaluated in RQ3, and a DevOps expert manually crafted intent specifications for 51 benchmark samples for user intent matching evaluation.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "DPIaC-Eval serves as the held-out test set; LLMs are not fine-tuned on any portion of it and are evaluated zero-shot.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by difficulty level (Fig. 4), error stage (Fig. 8), error type (Table 3), and per-model performance across all metrics (Tables 2, 4, 5).",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 6.2 analyzes five specific error categories (Missing Value, Self-defined Property, Null Substitution, Unnecessary Whitespace, Arbitrary Default Value) with per-model failure counts and root cause analysis.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper honestly reports negative findings: only 8.4% filtered security compliance, only 25.2% user intent satisfaction, GPT-4o's substantially lower performance (55.2% vs Claude's 95.5% at passItr@15).",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Model names such as 'Claude-3.5', 'Claude-3.7', 'GPT-4o', 'GPT-o3-mini' are used without specifying exact version identifiers or snapshot dates; the paper only promises these details are in the replication package.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "Full prompts are not included in the paper; the system prompt structure is described but actual prompt text is deferred to the code repository.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "A temperature of 0 and a maximum output token limit of 8,000 are explicitly stated for all model evaluations.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The IaCGen framework is described in detail in Section 4, including the three validation stages (format verification, syntax checking, live deployment) and the feedback allocation strategy (2 general + 4 detailed attempts per stage).",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "The benchmark construction pipeline is documented with specific filtering steps and template counts at each stage: 900→850 (size filtering)→465 (syntax check)→200 (deployment test)→153 (rectification), shown in Fig. 2.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "The DPIaC-Eval benchmark (153 templates and prompts) is available in the replication package's Data folder.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 3.1 describes template sources (AWS documentation, AWS Samples GitHub, GitHub repositories using CloudFormation), ethical licensing checks (MIT, Apache 2.0), and the multi-stage preprocessing pipeline.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants were recruited as study subjects; DevOps practitioners were used for benchmark construction but not as experimental participants.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The complete data pipeline from collection to final benchmark is documented in Section 3.1 and illustrated in Fig. 2, including filtering criteria and counts at each stage.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "Training data cutoffs for the six LLMs are not stated in the paper; the paper only mentions these will be documented in the replication package.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "The DPIaC-Eval templates were sourced from publicly available GitHub repositories and AWS documentation that predate the LLMs' training cutoffs; potential overlap is never discussed.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "The benchmark templates are from public GitHub repositories and AWS sample libraries that were almost certainly available before the LLMs' training cutoffs; the paper does not address this contamination risk.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants as experimental subjects; DevOps practitioners were used only for benchmark construction.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human subjects research; ethics mentions relate only to IP licensing of templates.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants as experimental subjects.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants as experimental subjects.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants as experimental subjects.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants as experimental subjects.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants as experimental subjects.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Per-template costs are reported: Claude-3.7-Sonnet $0.42 (most expensive), DeepSeek-V3 $0.04 (cheapest), AWS deployment $0.04 per deployable template.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Total study costs are explicitly stated: $230.75 for LLM API tokens and $35.21 for AWS deployment validation.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Six state-of-the-art LLMs achieve only 20.8–30.2% deployment success rate on the first attempt at IaC template generation.",
    375       "evidence": "Table 2 shows passItr@1 results: GPT-4o 22.7%, GPT-o3-mini 20.8%, Claude-3.5 30.2%, Claude-3.7 26.8%, DeepSeek-R1 22.9%, DeepSeek-V3 24.2%.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "IaCGen achieves 54.6–91.6% deployment success in 10 iterations across all evaluated models.",
    380       "evidence": "Table 2 shows passItr@10: GPT-4o 54.6%, GPT-o3-mini 66.2%, Claude-3.5 91.6%, Claude-3.7 86.9%, DeepSeek-R1 68.0%, DeepSeek-V3 56.9%.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Maintaining complete conversation history reduces required iterations by 15.9% compared to providing only the most recent error.",
    385       "evidence": "Ablation study (Fig. 7) on Claude-3.5 shows IaCGen averages 4.55 iterations vs. baseline's 5.41 iterations to achieve deployable templates.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Human-in-the-loop feedback enables all six models to exceed 90% passItr@25.",
    390       "evidence": "Section 6.3 and Fig. 9 show all models surpass 90% passItr@25 with human feedback; Claude models reach 98%.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Only 25.2% of generated IaC templates fully satisfy user intent at both resource and attribute level.",
    395       "evidence": "Table 4 shows average resource-level matching of 58.8%, attribute-level 40.5%, and combined Resource & Attribute only 25.2% across all models.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "Only 8.4% of generated deployable templates achieve full security compliance when filtered for applicable policies.",
    400       "evidence": "Table 5 shows filtered compliance rates ranging from 6.1% (GPT-4o) to 11.5% (DeepSeek-V3), averaging 8.4%.",
    401       "supported": "moderate"
    402     },
    403     {
    404       "claim": "IaCGen generalizes to Terraform, achieving 100% passItr@7 syntax accuracy with Claude-3.5 on IaC-Eval benchmark.",
    405       "evidence": "Section 6.1 reports 79.7% passItr@1 and 100% passItr@7 on IaC-Eval Terraform benchmark with an average of 1.58 iterations.",
    406       "supported": "weak"
    407     }
    408   ],
    409   "methodology_tags": [
    410     "benchmark-eval"
    411   ],
    412   "key_findings": "Current LLMs are poor at generating deployable AWS CloudFormation templates with only 20.8–30.2% first-attempt success, despite reasonable syntactic correctness. The IaCGen iterative feedback framework dramatically improves this to 54.6–91.6% within 10 iterations by simulating real DevOps workflows with progressive validation stages. Security compliance of generated templates is alarmingly low at 8.4% filtered compliance, and user intent matching is weak at 25.2% combined resource-and-attribute satisfaction, indicating that deployability is necessary but far from sufficient for practical utility. Maintaining complete conversation history is more effective than isolated-feedback approaches, as it prevents 'Error Recurrence' where LLMs reintroduce previously corrected mistakes.",
    413   "red_flags": [
    414     {
    415       "flag": "No statistical significance testing",
    416       "detail": "All comparative claims between models and conditions are made without statistical tests, despite clear numerical differences that require significance assessment."
    417     },
    418     {
    419       "flag": "Benchmark contamination unaddressed",
    420       "detail": "DPIaC-Eval templates were sourced from public GitHub repositories and AWS documentation that predate the LLMs' training cutoffs; potential memorization of test templates is never discussed."
    421     },
    422     {
    423       "flag": "User intent evaluation on 51/153 samples",
    424       "detail": "The intent matching evaluation (RQ4) uses only 51 randomly sampled instances from the 153-template benchmark, reducing statistical power for this important finding."
    425     },
    426     {
    427       "flag": "Vague model version identifiers",
    428       "detail": "Model names like 'Claude-3.5' and 'Claude-3.7' are not fully specified in the paper; exact version/snapshot identifiers are deferred to the replication package only."
    429     },
    430     {
    431       "flag": "Single-run results, no variance",
    432       "detail": "All results appear to come from single evaluation runs, with no variance reported across runs; using temperature=0 only partially addresses stochasticity."
    433     },
    434     {
    435       "flag": "Terraform generalizability underpowered",
    436       "detail": "Terraform generalizability is tested with only Claude-3.5 and only measures syntax correctness (not deployability), making the generalizability claim much weaker than presented."
    437     }
    438   ],
    439   "cited_papers": [
    440     {
    441       "title": "IaC-Eval: A Code Generation Benchmark for Cloud Infrastructure-as-Code Programs",
    442       "relevance": "Primary prior benchmark for LLM IaC generation; DPIaC-Eval is directly compared and extended from this work."
    443     },
    444     {
    445       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    446       "relevance": "Standard code generation benchmark used as reference point; IaC first-attempt success rates (20.8–30.2%) are contrasted with HumanEval rates (~95%)."
    447     },
    448     {
    449       "title": "Teaching Large Language Models to Self-Debug",
    450       "relevance": "Related feedback mechanism approach for code generation; IaCGen extends this concept to IaC with multi-stage deployment feedback."
    451     },
    452     {
    453       "title": "Self-Refine: Iterative Refinement with Self-Feedback",
    454       "relevance": "Foundational iterative refinement approach that IaCGen builds upon, extending to deployment-validated IaC generation."
    455     },
    456     {
    457       "title": "Using a Feedback Loop for LLM-based Infrastructure as Code Generation",
    458       "relevance": "Most closely related prior work; IaCGen improves upon it by preserving conversation history and including live deployment validation."
    459     },
    460     {
    461       "title": "RepoCoder: Repository-Level Code Completion through Iterative Retrieval and Generation",
    462       "relevance": "Related iterative feedback approach for code generation that only provides immediate error messages, contrasted with IaCGen's full conversation history approach."
    463     }
    464   ],
    465   "engagement_factors": {
    466     "practical_relevance": {
    467       "score": 3,
    468       "justification": "Directly addresses a pain point for DevOps practitioners — automating CloudFormation template generation with a working framework and public replication package."
    469     },
    470     "surprise_contrarian": {
    471       "score": 2,
    472       "justification": "The finding that syntactic correctness is nearly useless as an IaC quality metric (42.7% of syntactically valid templates fail deployment) challenges how the field has been evaluating LLMs for IaC."
    473     },
    474     "fear_safety": {
    475       "score": 1,
    476       "justification": "The 8.4% security compliance finding is concerning for cloud security practitioners but is framed as a research gap rather than an imminent risk."
    477     },
    478     "drama_conflict": {
    479       "score": 1,
    480       "justification": "Claude vs GPT comparison shows dramatic performance difference (95.5% vs 55.2% passItr@15) that practitioners will notice, but framing is academic rather than dramatic."
    481     },
    482     "demo_ability": {
    483       "score": 2,
    484       "justification": "Code is publicly available on GitHub and the framework can be run against the DPIaC-Eval benchmark, though it requires AWS account setup and API keys."
    485     },
    486     "brand_recognition": {
    487       "score": 1,
    488       "justification": "Authors are from ANU, NYU/Columbia, NTU, and CSIRO's Data61 — established institutions but not AI lab brand names; the paper is only submitted to FSE, a respected software engineering venue rather than a top-tier AI conference."
    489     }
    490   },
    491   "hn_data": {
    492     "threads": [],
    493     "top_points": 0,
    494     "total_points": 0,
    495     "total_comments": 0
    496   }
    497 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs