scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25157B)
      1 {
      2   "paper": {
      3     "title": "CloudEval-YAML: A Practical Benchmark for Cloud Configuration Generation",
      4     "authors": [
      5       "Yifei Xu",
      6       "Yuning Chen",
      7       "Xumiao Zhang",
      8       "Xianshang Lin",
      9       "Pan Hu",
     10       "Yunfei Ma",
     11       "Songwu Lu",
     12       "Wan Du",
     13       "Z. Morley Mao",
     14       "Ennan Zhai",
     15       "Dennis Cai"
     16     ],
     17     "year": 2023,
     18     "venue": "arXiv",
     19     "arxiv_id": "2401.06786"
     20   },
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The abstract states: 'We release the dataset along with the evaluation framework at https://github.com/alibaba/CloudEval-YAML.' A GitHub URL is provided."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The dataset of 1011 problems is released as part of the GitHub repository mentioned in the abstract: https://github.com/alibaba/CloudEval-YAML."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper describes the evaluation cluster hardware (64 4-core 8GB machines, Minikube, Docker) but does not provide a requirements.txt, Dockerfile, or detailed environment setup with library versions for reproducing the benchmark framework."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "While the code is released, the paper itself does not contain step-by-step reproduction instructions. The evaluation platform architecture is described at a high level (§3), but specific commands or a reproducibility guide are not provided in the paper."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "All results in Tables 4, 5, and 6 are reported as single point estimates without confidence intervals or error bars. No uncertainty quantification is provided."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper makes multiple comparative claims (e.g., 'Proprietary models outperform open-source models by a large gap') but uses no statistical significance tests. All comparisons are based on raw score differences."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The paper reports relative differences with baselines, e.g., 'GPT-3.5 and GPT-4 score 0.412 and 0.515, which is 4.84× and 6.06× as that of llama-2-70b-chat' (§4.1), and '20-sample generation could improve the unit test score of Llama-2-70B/PaLM-2/GPT-3.5 by 30%/37%/39%' (§4.2). These provide magnitude context."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The dataset has 1011 problems (337 hand-picked original problems, expanded to 1011 through data augmentation), but there is no justification for why this size is sufficient, no power analysis, and no discussion of whether the sample provides adequate statistical power for the comparisons made."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be single-run evaluations. The multi-sample generation experiment (§4.2) reports pass@k but without variance across runs."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The paper evaluates 12 LLMs against each other (Table 4), including both open-source and proprietary models of varying sizes, providing comparative baselines."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The baselines include GPT-4, GPT-3.5, PaLM-2, Llama-2, Code Llama, and WizardCoder, which were all contemporary state-of-the-art models at the time of submission (late 2023)."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The paper ablates multiple dimensions: data augmentation effects (Table 5 comparing original/simplified/translated), multi-sample generation (§4.2), few-shot prompting (§4.3), and the unit test prediction classifier (§4.4). These serve as ablation-like analyses of different factors affecting performance."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Six metrics are used across three categories: text-level (BLEU, Edit Distance, Exact Match), YAML-aware (Key-Value Exact Match, Key-Value Wildcard Match), and function-level (Unit Test). Described in §3.2."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "There is no human evaluation of the LLM-generated YAML outputs. All evaluation is automated through the six metrics. Given that the paper claims practical relevance for cloud developers, human evaluation of output quality would be relevant."
     96       },
     97       "held_out_test_set": {
     98         "applies": false,
     99         "answer": false,
    100         "justification": "This is a benchmark paper that evaluates models on the full dataset. There is no model training or tuning on the CloudEval-YAML data that would require a held-out split (except the XGBoost classifier in §4.4, which uses leave-one-model-out cross-validation)."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Figure 6 provides per-category breakdowns across four perspectives: application category (Kubernetes/Envoy/Istio), code context presence, answer length, and question token count. Table 5 breaks down by dataset type."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 4.1 includes a detailed failure mode analysis (Figure 7) that categorizes failures into 6 modes from empty responses to valid YAML that fails unit tests, with per-model breakdowns for GPT-4, Llama-2-70B, and Llama-2-7B."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Several negative results are reported: few-shot prompting 'does not yield significant improvements' (§4.3), code-specific models 'typically perform poorly on CloudEval-YAML compared to general LLMs' (§4.1), and the unit test predictor has high relative error up to 80% (§4.4)."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims are supported: the dataset has 1011 problems (Table 2), the evaluation platform achieves 20x speedup (Figure 5), and the paper evaluates 12 LLMs (Table 4). The claims are descriptive and accurately reflected in the results."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The paper makes implicit causal claims, e.g., 'Simplification of problems generally leads to lower performance' (§4.1) and attributes code LLM poor performance to 'the dataset used in the fine-tuning process' (§4.1). These are observational comparisons presented with causal language but without controlled experiments isolating these factors."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title claims 'Cloud Configuration Generation' broadly, but the benchmark covers only Kubernetes, Envoy, and Istio YAML configurations. Many cloud configuration formats (Terraform, Ansible, CloudFormation, Pulumi, Docker Compose) are not covered. The paper does not explicitly bound its claims to these three tools."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper does not discuss alternative explanations for its findings. For example, the large gap between proprietary and open-source models could be due to training data differences, model size, or RLHF, but these are not explored. The hypothesis about code LLMs' poor performance is stated as speculative ('It may be related to...') without analysis."
    138       }
    139     },
    140     "setup_transparency": {
    141       "model_versions_specified": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "Models are referred to as 'GPT-3.5', 'GPT-4', 'PaLM-2-bison' without specific version strings or snapshot dates. For open-source models, sizes are given (e.g., 'Llama-2-70b-chat', 'Wizardcoder-34b-v1.0') but for proprietary models, no API version or snapshot date is specified."
    145       },
    146       "prompts_provided": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "The full prompt template is provided in Appendix B: 'You are an expert engineer in cloud native development. According to the question, please provide only complete formatted YAML code as output without any description...' The few-shot examples are in Appendix C."
    150       },
    151       "hyperparameters_reported": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "For the multi-sample generation experiment, Llama-2-70B hyperparameters are stated (temperature=0.75, top_p=0.9, top_k=50), but for proprietary models, the paper says 'We leave parameters that control the randomness of the output to default.' The default values for the main benchmark (Table 4) are not specified for any model."
    155       },
    156       "scaffolding_described": {
    157         "applies": false,
    158         "answer": false,
    159         "justification": "No agentic scaffolding is used. The evaluation uses single-turn prompt-response generation from LLMs."
    160       },
    161       "data_preprocessing_documented": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 3.1 documents the post-processing pipeline for extracting YAML from LLM responses, including specific rules for removing content before keywords and for extracting delimited blocks. Section 2.2 documents the data augmentation process, which uses GPT-4 together with manual review."
    165       }
    166     },
    167     "limitations_and_scope": {
    168       "limitations_section_present": {
    169         "applies": true,
    170         "answer": false,
    171         "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The conclusion (§6) mentions future work directions but does not discuss limitations of the current approach."
    172       },
    173       "threats_to_validity_specific": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No threats to validity are discussed. The paper does not address potential issues such as the limited scope of cloud tools covered, the representativeness of the problem selection, or the reliability of unit tests as the sole functional correctness measure."
    177       },
    178       "scope_boundaries_stated": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound its findings to Kubernetes/Envoy/Istio, does not discuss what types of cloud configurations are excluded, and does not note limitations of unit-test-only evaluation."
    182       }
    183     },
    184     "data_integrity": {
    185       "raw_data_available": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "The dataset and evaluation framework are released at https://github.com/alibaba/CloudEval-YAML, which would include the raw problems, reference YAML files, and unit test scripts."
    189       },
    190       "data_collection_described": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "Section 2.1 describes the data collection process: problems were hand-picked from official documentation websites, popular StackOverflow issues, and highly-ranked blog posts, with explicit guidelines for selection (clearly defined, diverse, avoiding ambiguity)."
    194       },
    195       "recruitment_methods_described": {
    196         "applies": false,
    197         "answer": false,
    198         "justification": "No human participants were recruited for the evaluation. The data augmentation involved a survey with a cloud provider operations team, but this is dataset construction, not a human subjects study. The benchmark is a standard dataset evaluation."
    199       },
    200       "data_pipeline_documented": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The full pipeline from problem selection (§2.1) through data augmentation (§2.2) to evaluation (§3) is documented, including the statistics at each stage (337 original problems, augmented to 1011). Table 1 shows statistics of the augmentation."
    204       }
    205     },
    206     "conflicts_of_interest": {
    207       "funding_disclosed": {
    208         "applies": true,
    209         "answer": false,
    210         "justification": "No funding source or acknowledgments section is present in the paper. Given that 6 of 11 authors are affiliated with Alibaba Cloud, funding disclosure would be expected."
    211       },
    212       "affiliations_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Author affiliations are clearly listed: '1 Alibaba Cloud 2 University of Michigan 3 UCLA 4 UC Merced'. The Alibaba Cloud affiliation is prominently stated."
    216       },
    217       "funder_independent_of_outcome": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "Six authors are from Alibaba Cloud, a major cloud provider. The benchmark evaluates LLM capabilities for cloud configuration generation, which is directly relevant to Alibaba's business interests. This creates a potential conflict of interest that is not acknowledged."
    221       },
    222       "financial_interests_declared": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No competing interests statement or financial interests declaration is present in the paper. Alibaba Cloud authors may have financial interests in demonstrating the need for better cloud configuration tools."
    226       }
    227     },
    228     "contamination": {
    229       "training_cutoff_stated": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No training data cutoff dates are stated for any of the 12 models evaluated. This is relevant because some problems are derived from StackOverflow and documentation that may be in training data."
    233       },
    234       "train_test_overlap_discussed": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The paper does not discuss whether problems sourced from official documentation or StackOverflow could have been seen by the models during training. This is a significant concern given that the problems are drawn from public online sources."
    238       },
    239       "benchmark_contamination_addressed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "Problems are sourced from 'official documentation websites, popular issues from StackOverflow, and highly-ranked blog posts' — all public sources likely in LLM training data. The paper does not discuss this contamination risk at all."
    243       }
    244     },
    245     "human_studies": {
    246       "pre_registered": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants were studied. The paper is a benchmark evaluation of LLMs."
    250       },
    251       "irb_or_ethics_approval": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants were studied. The paper is a benchmark evaluation of LLMs."
    255       },
    256       "demographics_reported": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants were studied. The paper is a benchmark evaluation of LLMs."
    260       },
    261       "inclusion_exclusion_criteria": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants were studied. The paper is a benchmark evaluation of LLMs."
    265       },
    266       "randomization_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants were studied. The paper is a benchmark evaluation of LLMs."
    270       },
    271       "blinding_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants were studied. The paper is a benchmark evaluation of LLMs."
    275       },
    276       "attrition_reported": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants were studied. The paper is a benchmark evaluation of LLMs."
    280       }
    281     },
    282     "cost_and_practicality": {
    283       "inference_cost_reported": {
    284         "applies": true,
    285         "answer": true,
    286         "justification": "Table 3 provides a detailed cost breakdown: GPT-3.5 inference costs $0.60, Llama-7b costs $2.90. Section 3.4 discusses per-1k-token costs. Section 4.2 discusses cost tradeoffs between multi-sample GPT-3.5 vs. single-sample GPT-4."
    287       },
    288       "compute_budget_stated": {
    289         "applies": true,
    290         "answer": true,
    291         "justification": "Table 3 reports cloud evaluation costs ($0.71-$5.51 depending on instance type). The evaluation cluster specs are described: 64 4-core 8GB machines. Total benchmark running cost ranges from $1.31 to $8.41 per run."
    292       }
    293     }
    294   },
    295   "claims": [
    296     {
    297       "claim": "Proprietary models (GPT-3.5, GPT-4) outperform open-source models by a much larger gap on CloudEval-YAML than on HumanEval.",
    298       "evidence": "Table 4 shows GPT-4 unit test score of 0.515 vs Llama-2-70b-chat at 0.085 (6.06x gap). On HumanEval the gap is only 2.24x (§4.1).",
    299       "supported": "strong"
    300     },
    301     {
    302       "claim": "Dedicated code generation models perform poorly on cloud configuration compared to general LLMs of similar or smaller size.",
    303       "evidence": "Table 4: wizardcoder-34b-v1.0 scores 0.056 unit test, while llama-2-13b-chat scores 0.067 with less than half the model size (§4.1).",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "Multi-sample generation can improve unit test scores by 30-39% and make cheaper models cost-competitive with expensive ones.",
    308       "evidence": "Figure 8 and §4.2: 20-sample generation improves Llama-2-70B/PaLM-2/GPT-3.5 by 30%/37%/39%. GPT-3.5 with 6 samples beats GPT-4 with 1 sample at 30x lower cost.",
    309       "supported": "moderate"
    310     },
    311     {
    312       "claim": "Few-shot prompting does not yield significant improvements in cloud configuration generation.",
    313       "evidence": "Table 6: GPT-3.5 improves from 142 to 154 (+12) with 3 shots, but Llama-2-70b-chat decreases from 30 to 29 (-1). Results are inconsistent across models (§4.3).",
    314       "supported": "moderate"
    315     },
    316     {
    317       "claim": "The scalable evaluation platform achieves 20x speedup over single-machine evaluation.",
    318       "evidence": "Figure 5 shows 10.3 hours on single machine vs ~0.5 hours with 64 workers and caching, a 20x improvement (§3.3).",
    319       "supported": "strong"
    320     },
    321     {
    322       "claim": "CloudEval-YAML is the first hand-written dataset targeting cloud-native applications.",
    323       "evidence": "Table 7 compares against 15 other benchmarks. None target cloud-native YAML configuration. The closest is Ansible-YAML but it is scraped from GitHub/GitLab, not hand-written (§5).",
    324       "supported": "moderate"
    325     }
    326   ],
    327   "methodology_tags": [
    328     "benchmark-eval"
    329   ],
    330   "key_findings": "CloudEval-YAML is a benchmark of 1011 YAML configuration generation problems for cloud-native applications (Kubernetes, Envoy, Istio) — 337 hand-written originals expanded to 1011 through data augmentation — with unit tests for functional correctness evaluation. An evaluation of 12 LLMs shows proprietary models (GPT-4: 51.5% pass rate) dramatically outperform open-source models (best open-source Llama-2-70b: 8.5%), with a gap much larger than seen on general-purpose code benchmarks like HumanEval. Multi-sample generation is found to be an effective strategy, with GPT-3.5 at 6 samples beating GPT-4's single-sample performance at 1/30th the cost. Few-shot prompting showed no significant benefit for this domain.",
    331   "red_flags": [
    332     {
    333       "flag": "No contamination analysis",
    334       "detail": "Problems are sourced from official documentation, StackOverflow, and blog posts — all likely in LLM training data. The paper does not discuss whether models may have seen these exact problems during training, which could inflate scores for models trained on more web data."
    335     },
    336     {
    337       "flag": "No limitations section",
    338       "detail": "The paper has no dedicated limitations or threats-to-validity section. For a benchmark paper, this is concerning — issues like benchmark scope, unit test reliability, and problem representativeness should be discussed."
    339     },
    340     {
    341       "flag": "No statistical rigor in comparisons",
    342       "detail": "All model comparisons are based on single-run point estimates without confidence intervals, significance tests, or variance reporting. Claims like 'proprietary models outperform open-source by a large gap' lack statistical support."
    343     },
    344     {
    345       "flag": "Potential conflict of interest",
    346       "detail": "6 of 11 authors are from Alibaba Cloud, a major cloud provider with business interest in cloud configuration tooling. No conflicts of interest are disclosed, and no funding statement is provided."
    347     },
    348     {
    349       "flag": "Overly broad title relative to scope",
    350       "detail": "The title claims 'Cloud Configuration Generation' but the benchmark covers only Kubernetes, Envoy, and Istio YAML. Many important cloud configuration formats (Terraform, CloudFormation, Ansible, Pulumi, Docker Compose) are not included."
    351     }
    352   ],
    353   "cited_papers": [
    354     {
    355       "title": "Evaluating large language models trained on code",
    356       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    357       "year": 2021,
    358       "arxiv_id": "2107.03374",
    359       "relevance": "Introduces HumanEval, the most widely used LLM code generation benchmark, which CloudEval-YAML directly compares against."
    360     },
    361     {
    362       "title": "Program synthesis with large language models",
    363       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    364       "year": 2021,
    365       "arxiv_id": "2108.07732",
    366       "relevance": "Introduces MBPP benchmark for Python code generation, another key comparison point for code LLM evaluation."
    367     },
    368     {
    369       "title": "Llama 2: Open foundation and fine-tuned chat models",
    370       "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"],
    371       "year": 2023,
    372       "arxiv_id": "2307.09288",
    373       "relevance": "Describes Llama 2, one of the primary open-source LLMs evaluated in the benchmark."
    374     },
    375     {
    376       "title": "Code llama: Open foundation models for code",
    377       "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"],
    378       "year": 2023,
    379       "arxiv_id": "2308.12950",
    380       "relevance": "Describes Code Llama, a code-specialized LLM that surprisingly underperforms general LLMs on cloud configuration tasks."
    381     },
    382     {
    383       "title": "WizardCoder: Empowering code large language models with evol-instruct",
    384       "authors": ["Ziyang Luo", "Can Xu", "Pu Zhao"],
    385       "year": 2023,
    386       "arxiv_id": "2306.08568",
    387       "relevance": "Describes WizardCoder, another code-specialized LLM evaluated in the benchmark that underperforms general LLMs."
    388     },
    389     {
    390       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
    391       "authors": ["Priyan Vaithilingam", "Tianyi Zhang", "Elena L. Glassman"],
    392       "year": 2022,
    393       "relevance": "Studies the usability and productivity impact of LLM-based code generation tools, relevant to the practical motivation of the benchmark."
    394     },
    395     {
    396       "title": "PaLM 2 technical report",
    397       "authors": ["Rohan Anil", "Andrew M. Dai", "Orhan Firat"],
    398       "year": 2023,
    399       "arxiv_id": "2305.10403",
    400       "relevance": "Describes PaLM 2, one of the proprietary LLMs evaluated in the benchmark."
    401     },
    402     {
    403       "title": "DS-1000: A natural and reliable benchmark for data science code generation",
    404       "authors": ["Yuhang Lai", "Chengxi Li", "Yiming Wang"],
    405       "year": 2023,
    406       "relevance": "A domain-specific code generation benchmark (data science) with unit tests, sharing design philosophy with CloudEval-YAML."
    407     },
    408     {
    409       "title": "Automated code generation for information technology tasks in YAML through large language models",
    410       "authors": ["Saurabh Pujar", "Luca Buratti", "Xiaojun Guo"],
    411       "year": 2023,
    412       "arxiv_id": "2305.02783",
    413       "relevance": "Focuses on Ansible YAML code generation with LLMs, the most closely related prior work to CloudEval-YAML."
    414     },
    415     {
    416       "title": "Measuring coding challenge competence with APPS",
    417       "authors": ["Dan Hendrycks", "Steven Basart", "Saurav Kadavath"],
    418       "year": 2021,
    419       "arxiv_id": "2105.09938",
    420       "relevance": "Large-scale code generation benchmark with unit tests, relevant to the evaluation methodology of LLM coding capability."
    421     }
    422   ]
    423 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs