scan.json (25149B)
1 { 2 "paper": { 3 "title": "Towards Formal Verification of LLM-Generated Code from Natural Language Prompts", 4 "authors": ["Aaron Councilman", "David Jiahao Fu", "Aryan Gupta", "Chengxiao Wang", "David Grove", "Yu-Xiong Wang", "Vikram Adve"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2507.13290", 8 "doi": "10.48550/arXiv.2507.13290" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Astrogator, a formal verification system for LLM-generated Ansible code, correctly accepts 82.9% of correct programs and rejects 92.4% of incorrect programs on a benchmark of 21 tasks with 1260 generated programs across 6 LLMs. Only 26.5% of LLM-generated Ansible programs were correct, with GPT-4o significantly outperforming open-source models (51.4% vs 21.5%). The paper demonstrates that a formal query language combined with a State Calculus and symbolic interpreter can provide formal correctness guarantees for DSL code generation.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper. Astrogator's implementation is described but not released." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "The benchmark suite of 21 tasks is described (NL descriptions and formal queries in Appendix A) but no downloadable dataset or archive of the 1260 generated programs is provided." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper mentions using VMs (Debian 12.11.0, Ubuntu 24.04.2, RHEL 9.6) for testing but provides no environment specification for running Astrogator itself (no requirements.txt, Dockerfile, or dependency list)." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No reproduction instructions, README, or scripts are provided. The paper describes the methodology but does not provide step-by-step instructions for reproducing experiments." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Results are reported as raw counts and percentages (e.g., 82.9% acceptance of correct programs, 92.4% rejection of incorrect programs) with no confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims GPT-4o 'performs significantly better' (51.4% vs 21.5%) but provides no statistical test to support this comparative claim." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Results are reported with baseline context: 277/334 correct programs accepted (82.9%), 856/926 incorrect programs rejected (92.4%), and per-model breakdowns with absolute counts out of 210 programs each (Table 3)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The benchmark suite has 21 tasks with 10 generations per LLM per task (1260 total). No justification is given for why 21 benchmarks or 10 samples per task is sufficient." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance or standard deviation is reported. LLMs are nondeterministic and prompted 10 times each, but no spread measures are provided across runs." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": false, 68 "justification": "There is no comparison to any alternative verification method, testing approach, or prior verification tool. Astrogator is evaluated in isolation." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": false, 73 "justification": "No baselines are included at all, so contemporaneity cannot be assessed." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": false, 78 "justification": "Astrogator has multiple components (formal query language, State Calculus, symbolic interpreter, unifier, knowledge base) but no ablation study is performed to measure individual contributions." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "The paper reports both acceptance rate of correct programs (82.9%) and rejection rate of incorrect programs (92.4%), effectively measuring both precision and recall of the verifier." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation is conducted. The formal query language is designed for user review but no user study validates that users can actually understand and verify formal queries." 89 }, 90 "held_out_test_set": { 91 "applies": false, 92 "answer": false, 93 "justification": "This is not a machine learning system being trained; it is a formal verification tool. There is no training/test split to discuss." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table 4 provides per-benchmark breakdowns of correct/incorrect programs accepted/rejected, and Table 3 provides per-model breakdowns of error types." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 6.3 extensively discusses both false rejections (57 correct programs rejected) and false acceptances (70 incorrect programs accepted), categorizing root causes including unsupported features, knowledge base limitations, and under-specified queries." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper honestly reports that only 26.5% of LLM-generated code is correct, that Astrogator rejects 57 correct programs and accepts 70 incorrect ones, and details the specific limitations causing these failures." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims 83% verification of correct code and 92% identification of incorrect code, matching the results in Section 6.3 (82.9% and 92.4% respectively)." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper's causal claims are primarily about the verification system's design choices. Claims about why errors occur (e.g., knowledge base returning single values, unsupported features) are justified by detailed analysis of each failure case in Section 6.3." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper is clear that Astrogator is specific to Ansible (Section 7.2: 'While Astrogator is specific to Ansible'), and future work sections (Bash, Arduino) are presented as illustrative examples with hand-translations, not claimed as fully supported." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not discuss alternative explanations for the observed verification rates. For example, it does not consider whether the 83%/92% rates are artifacts of the benchmark suite's simplicity or the particular LLMs chosen." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper is careful to distinguish between its test-based ground truth (which is imperfect) and true correctness, explicitly acknowledging in Section 6.1 that tests are used because 'no other solution was feasible' and that they 'have done our best to craft strong tests.'" 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Models are listed by name (Deepseek Coder 6.7b, Granite 8b Code, Qwen2.5 Coder 3b, Llama 3.1 8b, Starcoder 2 15b, GPT-4o) but no specific version strings or snapshot dates are given for GPT-4o." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "The full code-generation prompt is provided in Appendix B." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No temperature, top-p, or other generation hyperparameters are reported for any of the 6 LLMs used." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The LLMs are prompted directly with a single prompt to generate Ansible playbooks." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 6.2 describes post-processing of generated code: extracting key elements and inserting into a template, with counts of failures (42 syntax errors, 5 non-terminating programs)." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 7 (Discussion) covers engineering effort limitations and generality limitations. Section 6.3 extensively discusses verification accuracy limitations." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "The paper discusses specific threats: the benchmark suite is hand-crafted and small (21 tasks), the knowledge base returns only one correct value causing false rejections, test-based ground truth may have errors, and the formal query language coverage is limited." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "The paper explicitly states scope boundaries: focused on Ansible DSL, does not support shell commands, knowledge base is hand-curated, Bash and Arduino examples are hand-translated only. Section 7.1 acknowledges significant engineering effort required." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "The 1260 generated programs, test scripts, VM configurations, and Astrogator source code are not publicly available." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 6.1 describes benchmark construction from StackOverflow, Ansible Forum, Ansible Galaxy, and interesting variations. Section 6.2 describes code generation: 6 LLMs, 10 programs each, 21 benchmarks." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. The 'data' is LLM-generated code and a hand-crafted benchmark suite." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The full pipeline is documented: LLM generation → post-processing (42 syntax failures, 5 non-terminating) → VM testing (Debian, Ubuntu, RHEL) → correctness classification → verifier evaluation. Counts at each stage are provided in Section 6.2." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "Acknowledgments section discloses funding from IBM-ILLINOIS Discovery Accelerator Institute (IIDAI) and NSF (Delta computing resource, ACCESS program)." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly listed: five authors from UIUC and one from IBM Research. David Grove is from IBM Research, and the work is funded by IBM-ILLINOIS partnership." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": true, 218 "justification": "IBM and NSF fund the research but do not have a direct financial stake in the specific verification results. The paper does not evaluate IBM products." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement is provided. Given IBM Research co-authorship and IBM-ILLINOIS funding, a competing interests declaration would be expected." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff dates are stated for any of the 6 LLMs used to generate code." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "The benchmark tasks are derived from StackOverflow and Ansible Forum posts, which are likely in the training data of the LLMs. This potential overlap is not discussed." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "The benchmark is custom-built from public sources (StackOverflow, Ansible Forum, Ansible Galaxy) but no contamination analysis is performed. Since these sources are likely in LLM training data, the 26.5% correctness rate may be influenced by contamination." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Section 6.3 reports: 'Astrogator took about 70 seconds to process the 1260 programs,' giving wall-clock time for the verification pipeline." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "While verification time is mentioned (70 seconds for 1260 programs), the total compute budget for LLM code generation across 6 models and 1260 programs is not stated." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "LLMs are prompted 10 times per task to get diverse outputs, but no seed sensitivity analysis or variance across generations is reported." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Section 6.2: 'we prompt each for ten programs for each problem to further increase the diversity of our program set, resulting in a total of 1,260 programs.'" 302 }, 303 "hyperparameter_search_budget": { 304 "applies": false, 305 "answer": false, 306 "justification": "No hyperparameter tuning is performed for the verification system. The LLMs are used off-the-shelf." 307 }, 308 "best_config_selection_justified": { 309 "applies": false, 310 "answer": false, 311 "justification": "No configuration selection is performed; the verifier has a fixed design without tunable parameters." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors built both the verifier and the benchmark suite, including the formal queries and reference solutions. This self-evaluation bias is not acknowledged or discussed." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": false, 325 "answer": false, 326 "justification": "The verifier is deterministic and not compute-variable. LLM comparisons are across model size, not compute budget." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": true, 331 "justification": "Section 6.1 discusses benchmark design: tasks are 'common tasks based on top posts on StackOverflow and the Ansible Forum' with average 1.67 tasks per reference solution, comparable to prior work [37]. The limitations of test-based ground truth are explicitly discussed." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is used. LLMs are prompted directly without agents or tool use." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "The benchmark is derived from StackOverflow and Ansible Forum posts that predate the LLMs' training. No temporal analysis is discussed." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the prompt format or task descriptions leak information about expected solutions." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether benchmark tasks are similar to training examples. StackOverflow-derived tasks are likely well-represented in training data." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method is used." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Astrogator accepts 82.9% of correct LLM-generated Ansible programs (277/334)", 365 "evidence": "Section 6.3, Table 4: per-benchmark breakdown of accepted/rejected programs across all 21 benchmarks", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Astrogator rejects 92.4% of incorrect LLM-generated Ansible programs (856/926)", 370 "evidence": "Section 6.3, Table 4: per-benchmark breakdown showing rejection counts for incorrect programs", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Only 26.5% of LLM-generated Ansible programs are correct (334/1260)", 375 "evidence": "Section 6.2, Table 3: breakdown by model showing syntax errors, execution errors, test failures, and correct programs", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "GPT-4o significantly outperforms open-source models at Ansible code generation (51.4% vs 21.5%)", 380 "evidence": "Table 3: GPT-4o produces 108 correct programs (51.4%) vs open-source models averaging 21.5%", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "All incorrect rejections by Astrogator are due to addressable implementation limitations, not fundamental approach limitations", 385 "evidence": "Section 6.3: 18 due to unsupported features (blocks, handlers, shell commands), 39 due to knowledge base returning single values", 386 "supported": "moderate" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "Self-built benchmark with no external validation", 392 "detail": "The authors created both the benchmark suite (21 tasks), the formal queries, and the verification system. No independent validation of benchmark quality or formal query correctness is provided. The benchmark is small (21 tasks) for the claims being made." 393 }, 394 { 395 "flag": "No comparison to alternative approaches", 396 "detail": "Astrogator is evaluated in isolation with no comparison to testing, fuzzing, static analysis, or other verification approaches. The 83%/92% rates have no baseline for comparison." 397 }, 398 { 399 "flag": "Missing hyperparameters for LLM generation", 400 "detail": "Temperature, top-p, and other generation parameters are not reported for any of the 6 LLMs, making the code generation portion of the experiment non-reproducible." 401 }, 402 { 403 "flag": "Contamination risk unaddressed", 404 "detail": "Benchmark tasks are derived from StackOverflow and Ansible Forum, which are likely in LLM training data. The 26.5% correctness rate and model comparisons may be influenced by training data overlap." 405 } 406 ], 407 "cited_papers": [ 408 { 409 "title": "Evaluating Large Language Models Trained on Code", 410 "authors": ["Mark Chen", "Jerry Tworek"], 411 "year": 2021, 412 "arxiv_id": "2107.03374", 413 "relevance": "HumanEval benchmark for LLM code generation evaluation, foundational to the code generation evaluation field." 414 }, 415 { 416 "title": "DeepSeek-Coder: When the Large Language Model Meets Programming", 417 "authors": ["Daya Guo", "Qihao Zhu"], 418 "year": 2024, 419 "arxiv_id": "2401.14196", 420 "relevance": "One of the open-source code generation models evaluated in this paper's experiments." 421 }, 422 { 423 "title": "What's Wrong with Your Code Generated by Large Language Models? An Extensive Study", 424 "authors": ["Shihan Dou", "Haoxiang Jia"], 425 "year": 2024, 426 "arxiv_id": "2407.06153", 427 "relevance": "Study of bugs in LLM-generated code, relevant to understanding code quality and verification needs." 428 }, 429 { 430 "title": "Do Users Write More Insecure Code with AI Assistants?", 431 "authors": ["Neil Perry", "Megha Srivastava", "Deepak Kumar", "Dan Boneh"], 432 "year": 2023, 433 "doi": "10.1145/3576915.3623157", 434 "relevance": "Security implications of AI code assistants, motivating the need for formal verification." 435 }, 436 { 437 "title": "Can Large Language Models Transform Natural Language Intent into Formal Method Postconditions?", 438 "authors": ["Madeline Endres", "Sarah Fakhoury"], 439 "year": 2024, 440 "doi": "10.1145/3660791", 441 "relevance": "LLM formalization of natural language into formal specifications, directly related approach." 442 }, 443 { 444 "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?", 445 "authors": ["Carlos E Jimenez", "John Yang"], 446 "year": 2024, 447 "relevance": "Major benchmark for LLM code generation in real-world settings." 448 }, 449 { 450 "title": "A Large-Scale Survey on the Usability of AI Programming Assistants: Successes and Challenges", 451 "authors": ["Jenny T. Liang", "Chenyang Yang", "Brad A. Myers"], 452 "year": 2024, 453 "doi": "10.1145/3597503.3608128", 454 "relevance": "Survey on AI code assistant usability, documenting challenges users face verifying LLM-generated code." 455 }, 456 { 457 "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation", 458 "authors": ["Jiawei Liu", "Chunqiu Steven Xia"], 459 "year": 2023, 460 "relevance": "Rigorous evaluation finding errors in HumanEval ground truth, relevant to code generation evaluation methodology." 461 }, 462 { 463 "title": "On the Effectiveness of Large Language Models in Domain-Specific Code Generation", 464 "authors": ["Xiaodong Gu", "Meng Chen"], 465 "year": 2025, 466 "doi": "10.1145/3697012", 467 "relevance": "Study of LLM effectiveness for DSL code generation, directly relevant to this paper's Ansible focus." 468 }, 469 { 470 "title": "Grounded Copilot: How Programmers Interact with Code-Generating Models", 471 "authors": ["Shraddha Barke", "Michael B. James", "Nadia Polikarpova"], 472 "year": 2023, 473 "doi": "10.1145/3586030", 474 "relevance": "Study of programmer interaction with AI code assistants, relevant to understanding verification needs." 475 }, 476 { 477 "title": "LLM Hallucinations in Practical Code Generation: Phenomena, Mechanism, and Mitigation", 478 "authors": ["Ziyao Zhang", "Chong Wang"], 479 "year": 2025, 480 "doi": "10.1145/3728894", 481 "relevance": "Study of hallucinations in LLM code generation, a key problem this paper aims to address through formal verification." 482 } 483 ] 484 }