scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30861B)
      1 {
      2   "paper": {
      3     "title": "Deployability-Centric Infrastructure-as-Code Generation: Fail, Learn, Refine, and Succeed through LLM-Empowered DevOps Simulation",
      4     "authors": [
      5       "Tianyi Zhang",
      6       "Shidong Pan",
      7       "Zejun Zhang",
      8       "Zhenchang Xing",
      9       "Xiaoyu Sun"
     10     ],
     11     "year": 2025,
     12     "venue": "FSE",
     13     "arxiv_id": "2506.05623",
     14     "doi": "10.48550/arXiv.2506.05623"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "State-of-the-art LLMs achieve only 20.8–30.2% deployment success on first attempt for IaC generation, but the IaCGen iterative feedback framework raises this to 54.6–91.6% within 10 iterations across 6 models. Maintaining full conversation history reduces required iterations by 15.9% by preventing error recurrence. Despite high deployment rates, trustworthiness remains poor: only 25.2% of generated templates satisfy both resource and attribute user intent, and only 8.4% pass security compliance checks.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Replication package released at https://github.com/Tianyi2/IaCGen (Section 9, Data Availability): 'The Code folder contains the code of our IaCGen framework.'"
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The replication package includes the benchmark: 'The Data folder contains the benchmarks' (Section 9, Data Availability)."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper states temperature=0 and 8000 max tokens, and mentions yamllint, cfn-lint, boto3, but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "The replication package contains READMEs: 'Detailed descriptions of files can be found in the README.md file within each folder' (Section 9). The repository URL is provided."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "All results in Tables 2, 4, and 5 are reported as point estimates with no confidence intervals, error bars, or ± notation."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Claims of model differences (e.g., 'Claude models show impressive performance') are based on comparing raw percentages without any statistical significance tests."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Effect sizes are reported with baseline context throughout: '≈24.7% passItr@1 to ≈74.8% passItr@15, a near 200% performance improvement' (Section 6.1), '15.9% reduction in required iterations' (Section 6.1), and percentage point gains for each feedback level."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No justification is given for the final benchmark size of 153 scenarios. The 51-instance subsample for user intent evaluation is described as 'randomly sampled' but no power analysis or sample size justification is provided."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "All experiments use temperature=0 for deterministic output and appear to be single-run. No variance, standard deviation, or spread measures are reported across experimental runs."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Multiple baselines: passItr@1 (no iteration), IaCGen without conversation history (Fig. 7), general feedback only vs. detailed feedback (Fig. 6), and comparison with IaC-Eval benchmark (Section 6.1)."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All six evaluated models are contemporary: GPT-4o, GPT-o3-mini, Claude-3.5, Claude-3.7, DeepSeek-R1, and DeepSeek-V3, representing state-of-the-art at time of writing."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Ablation on conversation history (Fig. 7, Section 6.1): IaCGen with vs. without full conversation history using Claude-3.5. Also, decomposition of feedback levels (no feedback / general / detailed) in Fig. 6."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Multiple metrics: passItr@n at various n (deployment success), user intent matching at resource and attribute levels (Table 4), three security compliance metrics (Table 5), and error stage distribution (Fig. 8)."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "All evaluation of system outputs is automated: deployment pass/fail via boto3, user intent matching via Checkov custom policies, security via Checkov. The human-in-the-loop component (RQ3) provides feedback during generation, not evaluation of outputs."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "All 153 benchmark scenarios are used for evaluation with no explicit train/dev/test separation. The framework design (feedback structure, iteration limits) was developed with knowledge of the benchmark, though iteration limits are justified by prior work [22]."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Extensive breakdowns: by difficulty level (Fig. 4), by model (Tables 2, 4, 5), by error stage (Fig. 8), by error type (Table 3), and by feedback level (Fig. 6)."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "RQ2 (Section 6.2) provides detailed failure analysis: five primary error categories (Missing Value, Self-defined Property, Null Substitution, Unnecessary Whitespace, Arbitrary Default Value) with counts per model and explanations."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Several negative results reported: poor initial pass rates (≈24.7%), poor user intent alignment (25.2% combined coverage), very low security compliance (8.4%), and GPT-4o's poor deployment error resolution."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Abstract claims match results: 20.8–30.2% first-attempt rates (Table 2 passItr@1), 54.6–91.6% at 10 iterations (Table 2 passItr@10), >90% passItr@25 with human feedback (Fig. 9), 25.2% user requirement coverage (Table 4), 8.4% security compliance (Table 5)."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The main causal claim (IaCGen's iterative feedback improves deployment success) is supported by controlled ablation: with vs. without conversation history (Fig. 7), and decomposition by feedback level (Fig. 6). These are adequate single-variable manipulations."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "Title ('Infrastructure-as-Code Generation') and abstract frame claims broadly, but the study is primarily CloudFormation-specific with only a small Terraform syntax-only extension. Section 7.4 acknowledges this but the title and abstract overclaim the generality."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "Section 7.4 discusses threats (model evolution, benchmark scope, difficulty categorization) but these are generalizability concerns, not alternative explanations for the observed results. For example, whether similar improvement could come from simple retrying without structured feedback, or from prompt-only approaches, is not discussed."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper explicitly acknowledges that deployment success (the primary metric) does not capture full IaC quality, and supplements it with user intent matching (Section 6.4) and security compliance analysis. This distinction between deployment success and overall template utility is clearly stated."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "Models are identified only by marketing names: 'GPT-4o, GPT-o3-mini, Claude-3.5, Claude-3.7, DeepSeek-R1, and DeepSeek-V3' (Section 5). No API version strings or snapshot dates. Section 7.4 notes 'we document the specific model and their cut-off date in our replication package' but the paper itself does not specify versions."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The paper describes prompt structure ('We need a CloudFormation template that creates [description]...') and states 'The complete prompts and other reproducible details mentioned in this section are available in our code repository' (Section 4.1) at https://github.com/Tianyi2/IaCGen."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 5: 'All models are evaluated with consistent temperature settings of 0 to maximize deterministic outputs and configured with the 8,000 maximum output token limit.' Iteration limits (2 general + 4 detailed feedback) also specified."
    159       },
    160       "scaffolding_described": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "IaCGen framework described in detail in Section 4 with workflow diagram (Fig. 5), three-stage validation pipeline, conversation history management, feedback mechanism design, and iteration logic."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 3.1 documents the full preprocessing pipeline with counts at each stage: ~900 templates → 850 (size filtering) → 465 (syntax validation) → 200 (deployment testing) → 153 (rectification). Filtering criteria at each stage are described."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 7.4 'Threats to Validity' provides a substantive discussion of limitations across multiple paragraphs."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 7.4 identifies threats specific to this study: models available at time of writing may yield different results, 153 scenarios may not capture 'highly specialized configurations', difficulty categorization 'may not align perfectly with all organizational perspectives', and absence of standardized benchmarks across IaC languages."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section 7.4 explicitly states boundaries: focus on CloudFormation, 153 scenarios across 58 services, a specific model set, and the acknowledged absence of standardized benchmarks across IaC languages. Section 2.2 also explicitly explains the focus on CloudFormation over Terraform for methodological reasons."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The replication package at https://github.com/Tianyi2/IaCGen contains the benchmark data and framework code (Section 9, Data Availability)."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section 3.1 describes template sourcing from AWS documentation, AWS Samples GitHub, and a CloudFormation repository dataset [38], with ethical licensing checks and multi-stage preprocessing."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "The paper mentions 'one DevOps practitioner with three years of AWS and CloudFormation experience' and 'two additional DevOps practitioners with equivalent expertise' for benchmark construction, but does not describe how these practitioners were recruited or selected."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Full pipeline documented in Section 3.1: ~900 initial templates → 850 after size filtering → 465 after syntax validation → 200 after deployment testing → 153 after rectification. Each stage's criteria are described."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding source, grants, or acknowledgments section is present in the paper."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "All author affiliations are clearly listed: Australian National University, NYU & Columbia University, Nanyang Technological University, and CSIRO's Data61. None are affiliated with the companies whose LLMs are evaluated."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "Cannot be determined since no funding source is disclosed. Authors are at academic institutions not affiliated with evaluated LLM providers, suggesting likely independence, but no explicit disclosure."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No model training cutoff dates are stated in the paper. Section 7.4 mentions 'we document the specific model and their cut-off date in our replication package' but this information is not in the paper itself."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "Templates are sourced from public AWS documentation and GitHub repositories, which are likely to be in LLM training data. The paper does not discuss potential overlap between these public templates and model training data."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "The benchmark templates originate from public AWS CloudFormation sample templates [36] and GitHub repositories [37, 38], which are highly likely to appear in LLM training corpora. This contamination risk is not discussed."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in the study. The DevOps practitioners involved in benchmark construction and the cloud engineer providing feedback are part of the methodology design, not study participants."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "The study has no human participants. This is a benchmark evaluation of LLMs."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "The study has no human participants."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "The study has no human participants."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "The study has no human participants."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "The study has no human participants."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "The study has no human participants."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Section 5 reports total API cost ($230.75), per-model costs (Claude-3.7 at $0.42/template, DeepSeek-V3 at $0.04/template), and deployment cost ($0.04/template). Total minimum cost of $0.08 per deployable template is stated."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Section 5 states total LLM API cost ($230.75) and AWS deployment cost ($35.21), totaling $265.96 for the full evaluation. Per-template costs are broken down by model."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "Temperature is set to 0 for deterministic output, but no multi-run analysis is performed to verify determinism across API calls or assess sensitivity to non-deterministic factors."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The paper implies single runs via temperature=0 but never explicitly states the number of experimental runs per configuration."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "No hyperparameter search is described. The iteration limit allocation (2 general + 4 detailed) is justified by prior work [22] but no systematic search over framework parameters is reported."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "Framework configuration (feedback structure, iteration limits) is fixed based on prior work [22] but no systematic comparison of alternative configurations is provided to justify that the chosen design is optimal."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "The paper compares 6 models across multiple metrics and difficulty levels, making numerous implicit comparisons, but performs no statistical tests and therefore no multiple comparison correction."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors developed both the DPIaC-Eval benchmark and IaCGen framework, then evaluate the framework on their own benchmark. No acknowledgment of author-evaluation bias or independent evaluation."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "Per-model API costs are reported (Section 5), but performance is not analyzed as a function of compute budget. Claude-3.7 ($0.42/template) vs DeepSeek-V3 ($0.04/template) represents a 10x cost difference but cost-performance tradeoff is not discussed."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": true,
    337         "justification": "The paper explicitly addresses construct validity by showing that neither syntactic correctness nor deployment success alone is sufficient: 42.7% of syntactically correct templates fail deployment (Section 3.3), and RQ4 demonstrates that deployable templates still largely fail user-intent and security checks (only 25.2% and 8.4% pass, respectively), motivating multi-dimensional evaluation."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "The same IaCGen framework is used consistently across all 6 model comparisons, controlling for scaffold differences. Model comparisons are within the same scaffold."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "Benchmark templates are sourced from public AWS documentation and GitHub repositories published before all evaluated models' training. No discussion of whether models may have seen these templates or similar ones during training."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether the natural language prompts or error messages in the iterative loop provide information that constitutes leakage beyond what would be available in real-world usage."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "Multiple benchmark templates share AWS service families and structural patterns. No discussion of whether templates from the same service are structurally similar enough to inflate performance estimates."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No concrete leakage detection methods (canary strings, membership inference, n-gram overlap analysis, decontamination) are applied despite the benchmark being sourced from publicly available templates."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "State-of-the-art LLMs achieve only 20.8–30.2% deployment success rate on first attempt for IaC generation",
    371       "evidence": "Table 2 passItr@1 results across 6 models on the 153-scenario DPIaC-Eval benchmark (Section 6.1)",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "IaCGen achieves 54.6–91.6% deployment success within 10 iterations across all evaluated models",
    376       "evidence": "Table 2 passItr@10 results for all 6 models; Claude-3.5 achieves 91.6%, GPT-4o 54.6% (Section 6.1)",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Maintaining complete conversation history reduces required iterations by 15.9% compared to providing only current error",
    381       "evidence": "Ablation study on Claude-3.5 (Fig. 7, Section 6.1): 4.55 average iterations vs 5.41 baseline",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "Human-in-the-loop feedback helps all 6 models achieve over 90% passItr@25",
    386       "evidence": "Fig. 9 showing solid lines (with human feedback) exceeding 90% for all models at 25 iterations (Section 6.3)",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Only 25.2% of generated IaC templates fully satisfy both resource-level and attribute-level user intent",
    391       "evidence": "Table 4 Resource & Attribute column, averaged across all models on 51 sampled instances (Section 6.4)",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Only 8.4% of deployable templates achieve full security compliance",
    396       "evidence": "Table 5 Filtered Compliance column, using Checkov security policy checks (Section 6.4)",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "IaCGen generalizes to Terraform, achieving 100% passItr@7 syntax accuracy",
    401       "evidence": "Section 6.1: Claude-3.5 on IaC-Eval Terraform benchmark achieves 79.7% passItr@1 and 100% passItr@7 with average 1.58 iterations",
    402       "supported": "weak"
    403     }
    404   ],
    405   "red_flags": [
    406     {
    407       "flag": "No statistical tests or uncertainty quantification",
    408       "detail": "All model comparisons are based on point estimates from single deterministic runs. No significance tests, confidence intervals, or variance measures are reported despite making explicit performance comparisons between models (e.g., 'Claude models show impressive performance')."
    409     },
    410     {
    411       "flag": "Benchmark contamination risk from public sources",
    412       "detail": "Templates are sourced from public AWS documentation and GitHub repositories that are highly likely to be in LLM training data. Models may have memorized similar or identical templates. This fundamental threat to validity is not discussed."
    413     },
    414     {
    415       "flag": "Self-evaluation bias",
    416       "detail": "Authors developed both the DPIaC-Eval benchmark and IaCGen framework, then evaluate IaCGen on their own benchmark without independent validation or acknowledgment of potential bias."
    417     },
    418     {
    419       "flag": "Small subsample for trustworthiness evaluation",
    420       "detail": "User intent matching (RQ4) is evaluated on only 51 randomly sampled instances from 153, without justification for the subsample size or analysis of whether this subsample is representative."
    421     },
    422     {
    423       "flag": "Conversation history ablation on single model only",
    424       "detail": "The key ablation study (with vs. without conversation history) is conducted only on Claude-3.5, the best-performing model. Results may not generalize to weaker models where conversation management could have different effects."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "Evaluating large language models trained on code",
    430       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    431       "year": 2021,
    432       "arxiv_id": "2107.03374",
    433       "relevance": "Introduced HumanEval benchmark and pass@k metric for LLM code generation, used as comparison baseline."
    434     },
    435     {
    436       "title": "IaC-Eval: A code generation benchmark for cloud infrastructure-as-code programs",
    437       "authors": ["Patrick T Kon", "Jiachen Liu", "Yiming Qiu"],
    438       "year": 2024,
    439       "relevance": "First IaC generation benchmark (Terraform), direct predecessor to DPIaC-Eval; found LLMs achieve <20% success."
    440     },
    441     {
    442       "title": "Is your code generated by chatgpt really correct? Rigorous evaluation of large language models for code generation",
    443       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    444       "year": 2023,
    445       "relevance": "Rigorous evaluation methodology for LLM-generated code quality and correctness."
    446     },
    447     {
    448       "title": "Using a feedback loop for llm-based infrastructure as code generation",
    449       "authors": ["Mayur Amarnath Palavalli", "Mark Santolucito"],
    450       "year": 2024,
    451       "arxiv_id": "2411.19043",
    452       "relevance": "Examined feedback loops for IaC generation, found effectiveness diminishes after 5 iterations; direct related work."
    453     },
    454     {
    455       "title": "Teaching large language models to self-debug",
    456       "authors": ["Xinyun Chen", "Maxwell Lin", "Nathanael Schärli", "Denny Zhou"],
    457       "year": 2023,
    458       "arxiv_id": "2304.05128",
    459       "relevance": "Foundational work on using natural language feedback for iterative LLM code refinement."
    460     },
    461     {
    462       "title": "Self-refine: Iterative refinement with self-feedback",
    463       "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"],
    464       "year": 2023,
    465       "relevance": "Iterative self-refinement framework for LLMs, methodological ancestor to IaCGen's feedback approach."
    466     },
    467     {
    468       "title": "Deepseek-coder: When the large language model meets programming–the rise of code intelligence",
    469       "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"],
    470       "year": 2024,
    471       "arxiv_id": "2401.14196",
    472       "relevance": "One of the evaluated model families (DeepSeek); demonstrates open-source LLM code generation capabilities."
    473     },
    474     {
    475       "title": "A survey of using large language models for generating infrastructure as code",
    476       "authors": ["Kalahasti Ganesh Srivatsa", "Sabyasachi Mukhopadhyay", "Ganesh Katrapati", "Manish Shrivastava"],
    477       "year": 2024,
    478       "arxiv_id": "2404.00227",
    479       "relevance": "Survey covering the landscape of LLM-based IaC generation approaches and challenges."
    480     },
    481     {
    482       "title": "Automated code generation for information technology tasks in yaml through large language models",
    483       "authors": ["Saurabh Pujar", "Luca Buratti", "Xiaojie Guo"],
    484       "year": 2023,
    485       "relevance": "Early work on LLM-based YAML/IaC code generation for IT automation tasks."
    486     },
    487     {
    488       "title": "Optimizing service deployments with nlp based infrastructure code generation-an automation framework",
    489       "authors": ["Hariharan Ragothaman", "Saai Krishnan Udayakumar"],
    490       "year": 2024,
    491       "relevance": "Attempted to address IaC deployability but with limited technical detail; direct related work to IaCGen."
    492     },
    493     {
    494       "title": "When ai takes the wheel: Security analysis of framework-constrained program generation",
    495       "authors": ["Yue Liu", "Zhenchang Xing", "Shidong Pan", "Chakkrit Tantithamthavorn"],
    496       "year": 2025,
    497       "arxiv_id": "2510.16823",
    498       "relevance": "Security analysis of LLM-generated code in constrained frameworks, related to the security compliance findings."
    499     },
    500     {
    501       "title": "A³-CodGen: A repository-level code generation framework for code reuse with local-aware, global-aware, and third-party-library-aware",
    502       "authors": ["Dianshu Liao", "Shidong Pan", "Xiaoyu Sun"],
    503       "year": 2024,
    504       "relevance": "Repository-level LLM code generation framework with context-aware approaches relevant to agentic code generation."
    505     }
    506   ],
    507   "engagement_factors": {
    508     "practical_relevance": {
    509       "score": 2,
    510       "justification": "IaCGen addresses a real DevOps pain point and the replication package is released, but it's a research framework rather than a production-ready tool."
    511     },
    512     "surprise_contrarian": {
    513       "score": 1,
    514       "justification": "The finding that iterative feedback helps is expected; the mildly surprising element is how poorly LLMs perform on first-attempt deployment (20–30%) despite strong general code generation abilities."
    515     },
    516     "fear_safety": {
    517       "score": 1,
    518       "justification": "The 8.4% security compliance rate raises concerns about deploying LLM-generated infrastructure, but this is an expected finding rather than a novel attack or existential concern."
    519     },
    520     "drama_conflict": {
    521       "score": 0,
    522       "justification": "No controversy or confrontational claims; the paper is a straightforward benchmark and framework evaluation."
    523     },
    524     "demo_ability": {
    525       "score": 2,
    526       "justification": "Code and benchmark are released on GitHub, and a practitioner with AWS access could run the framework, though it requires AWS credentials and API keys."
    527     },
    528     "brand_recognition": {
    529       "score": 1,
    530       "justification": "Evaluates well-known models (GPT-4o, Claude, DeepSeek) but the paper itself is from academic institutions without major brand recognition."
    531     }
    532   }
    533 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs