scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (17778B)
      1 {
      2   "paper": {
      3     "title": "A Case for Learned Cloud Emulators",
      4     "authors": ["Archit Bhatnagar", "Yiming Qiu", "Sarah McClure", "Sylvia Ratnasamy", "Ang Chen"],
      5     "year": 2025,
      6     "venue": "HotNets '25 (24th ACM Workshop on Hot Topics in Networks)",
      7     "doi": "10.1145/3696348.3696903"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL or code archive is provided in the paper."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No dataset or benchmark traces are released."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment details, dependency specifications, or library versions are provided."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No reproduction instructions or README with commands are provided."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results in Fig. 3 are point estimates (accuracy percentages) with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims its SM emulator outperforms D2C (which aligned on only 3 of 12 traces, versus higher accuracy for the SM emulator) but provides no statistical tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Only raw accuracy percentages are reported (e.g., 100% vs 50%) without formal effect size measures or sufficient baseline context."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The evaluation uses only 4 traces for each of 3 scenarios (12 total test cases) with no justification for why this sample size is sufficient."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or multi-run results are reported. Results appear to be single-run."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Two baselines are included: LocalStack (manually-engineered emulator) and a direct-to-code (D2C) LLM baseline using Gemini 2.5 Pro (Section 5)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "LocalStack is the state-of-the-art cloud emulator, and D2C uses Gemini 2.5 Pro, a contemporary model."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study is provided to isolate the contribution of individual components (SM abstraction, alignment phase, constrained decoding, etc.)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper evaluates API coverage (Table 1) and response alignment accuracy across scenarios (Fig. 3), plus service complexity metrics (Fig. 4)."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is a systems paper about cloud emulation; human evaluation of outputs is not relevant to the claims."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "The test traces (4 per scenario, 12 in total) are described, but there is no indication of a held-out test set or separation from development traces."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Fig. 3 breaks down accuracy by scenario type (Provisioning, Edge Cases, State Updates) and Table 1 breaks down API coverage by service."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 5 discusses specific failure cases of the D2C baseline (state errors, transition errors) and Section 6 discusses limitations of the proposed approach."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 6 acknowledges limitations: underspecified documentation issues, lack of alignment completeness guarantees, and reliance on alignment phase for underspecified APIs."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims the task is 'amenable to AI automation' and reports 'preliminary findings,' which aligns with the modest evaluation in Section 5."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper claims the SM abstraction 'prevents' logic errors 'by design' compared to D2C, but the evaluation is too small (12 traces) to adequately support this causal claim."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper tests on 3 AWS services and briefly mentions Azure, but the title and abstract suggest general 'cloud emulators' without bounding claims to the tested subset."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No discussion of alternative explanations. The D2C baseline's poor performance could be due to prompt design rather than fundamental limitations, but this is not considered."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper mentions 'Gemini 2.5 Pro' but provides no snapshot date or API version identifier."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "No prompts are provided. The paper describes prompting in natural language ('prompted to generate the emulation logic') without actual prompt text."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for LLM usage."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The workflow is described in detail in Section 4: documentation wrangling, specification extraction with incremental generation, consistency checks, and automated alignment (Fig. 2)."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.1 describes the documentation wrangling step, including how AWS PDFs are parsed by resource names and API signatures."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6 'Challenges & Limitations' discusses underspecified documentation and alignment completeness."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 6 discusses specific threats: underspecified documentation causing reliance on alignment, and inability to guarantee alignment completeness due to unbounded program space."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. The evaluation covers 3 AWS services with 12 traces, but the paper offers no explicit bounding of what this does not demonstrate."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data (traces, API responses, generated SM specs) are available for verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The paper describes that cloud documentation (AWS PDFs, Azure web pages) serves as input, and test traces are constructed for 3 scenarios (provisioning, state updates, edge cases)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants; data source is cloud documentation (a standard public resource)."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline is described in Section 4: documentation wrangling → specification extraction → consistency checks → alignment. Fig. 2 illustrates the workflow."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Acknowledgments section lists VMware Early Career Faculty Grant, Cisco grant, and NSF grants CNS-1942219, CNS-2106751, CNS-2107147, CNS-2214272."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: University of Michigan, University of Hong Kong, UC Berkeley. No evaluated product is affiliated with these institutions."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funders (VMware, Cisco, NSF) do not have a direct financial stake in the specific outcome of cloud emulator evaluation results."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. It uses an LLM as a code generation tool, not as a test subject."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "The paper does not evaluate a pre-trained model on a benchmark; the LLM is a tool for code synthesis, not the subject of benchmark evaluation."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper does not evaluate a pre-trained model on a benchmark; there is no benchmark evaluation of model knowledge."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The paper mentions 'code synthesis only took a couple of minutes' but does not report API costs, token counts, or systematic latency measurements."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget, API spend, or hardware specifications are provided."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The SM-based emulator achieves 100% API coverage for EC2, DynamoDB, and Network Firewall services, compared to LocalStack/Moto's partial coverage (31%, 68%, 11% respectively).",
    286       "evidence": "Table 1 shows API coverage comparison. Section 5 states the prototype 'captures all 45 API calls' for Network Firewall.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "The SM-based emulator achieves higher response alignment accuracy than the direct-to-code (D2C) baseline across all three scenario types.",
    291       "evidence": "Fig. 3 shows SM Emu at 100%/75%/100% vs D2C at 50%/25%/0% for Provisioning/Edge Cases/State Updates. D2C aligned in only 3 out of 12 traces.",
    292       "supported": "weak"
    293     },
    294     {
    295       "claim": "The SM abstraction prevents critical logic and state manipulation errors by design that the D2C approach is prone to.",
    296       "evidence": "Section 5 discusses specific D2C failures: missing state variables, missing state checks (e.g., DeleteVPC without checking gateways), and silent failures on invalid operations.",
    297       "supported": "weak"
    298     }
    299   ],
    300   "methodology_tags": ["case-study", "benchmark-eval"],
    301   "key_findings": "This workshop paper proposes using LLMs to automatically generate cloud emulators by extracting state machine specifications from cloud documentation, rather than hand-crafting emulation logic. A preliminary prototype using Gemini 2.5 Pro achieves 100% API coverage for three AWS services compared to LocalStack/Moto's 32% average coverage. In a small evaluation (12 traces across 3 scenarios), the SM-based approach shows higher fidelity than direct LLM code generation, which aligned on only 3/12 traces due to state and transition errors.",
    302   "red_flags": [
    303     {
    304       "flag": "Extremely small evaluation",
    305       "detail": "The core accuracy comparison uses only 4 traces for each of 3 scenarios (12 total test cases). This is far too small to support claims about systematic advantages of the SM approach over D2C."
    306     },
    307     {
    308       "flag": "No reproducibility artifacts",
    309       "detail": "No code, data, prompts, or environment details are released, making it impossible to verify or reproduce the results."
    310     },
    311     {
    312       "flag": "API coverage metric is misleading",
    313       "detail": "100% API coverage means the emulator has stubs for all APIs, but coverage does not imply correctness. The accuracy evaluation showing 75% on edge cases suggests significant gaps in behavioral fidelity."
    314     },
    315     {
    316       "flag": "Unfair D2C baseline comparison",
    317       "detail": "The D2C baseline uses direct prompting without RAG, while the SM approach benefits from structured documentation wrangling and iterative refinement. The comparison may reflect prompt engineering quality rather than fundamental approach differences."
    318     }
    319   ],
    320   "cited_papers": [
    321     {
    322       "title": "Hermes: unlocking security analysis of cellular network protocols by synthesizing finite state machines from natural language specifications",
    323       "authors": ["A. Al Ishtiaq", "S. S. S. Das", "S. M. M. Rashid"],
    324       "year": 2024,
    325       "relevance": "Directly related work on using LLMs to synthesize FSMs from natural language specifications for security analysis."
    326     },
    327     {
    328       "title": "Can large language models transform natural language intent into formal method postconditions?",
    329       "authors": ["M. Endres", "S. Fakhoury", "S. Chakraborty", "S. K. Lahiri"],
    330       "year": 2024,
    331       "relevance": "Evaluates LLM capability for translating natural language to formal specifications, relevant to AI code generation quality."
    332     },
    333     {
    334       "title": "Constrained decoding for secure code generation",
    335       "authors": ["Y. Fu", "E. Baker", "Y. Ding", "Y. Chen"],
    336       "year": 2024,
    337       "relevance": "Constrained decoding technique for improving LLM code generation quality and safety."
    338     },
    339     {
    340       "title": "Type-constrained code generation with language models",
    341       "authors": ["N. Mündler", "J. He", "H. Wang", "K. Sen", "D. Song", "M. Vechev"],
    342       "year": 2025,
    343       "relevance": "Constrained decoding approach for code generation ensuring type correctness."
    344     },
    345     {
    346       "title": "Fidelity of cloud emulators: The imitation game of testing cloud-based software",
    347       "authors": ["A. Mazhar", "S. S. Alam", "W. X. Zheng", "Y. Chen", "S. Nath", "T. Xu"],
    348       "year": 2025,
    349       "relevance": "Directly related work evaluating behavioral gaps between cloud emulators and real cloud services."
    350     },
    351     {
    352       "title": "Cloud infrastructure management in the age of AI agents",
    353       "authors": ["Z. Yang", "A. Bhatnagar", "Y. Qiu"],
    354       "year": 2025,
    355       "relevance": "Related work on AI agents for cloud management; connected to the cloud gym use case discussed in this paper."
    356     },
    357     {
    358       "title": "Restless: Enhancing state-of-the-art REST API fuzzing with LLMs in cloud service computing",
    359       "authors": ["T. Zheng", "J. Shao", "J. Dai"],
    360       "year": 2024,
    361       "relevance": "LLM-enhanced API fuzzing for cloud services, related to AI-assisted software testing."
    362     },
    363     {
    364       "title": "SpecGen: Automated generation of formal program specifications via large language models",
    365       "authors": ["L. Ma", "S. Liu", "Y. Li", "X. Xie", "L. Bu"],
    366       "year": 2024,
    367       "relevance": "LLM-based automated formal specification generation, directly relevant to AI code synthesis quality."
    368     }
    369   ]
    370 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs