scan.json (22632B)
1 { 2 "paper": { 3 "title": "LLM-based framework to support the construction of valid formal models", 4 "authors": [ 5 "Gábor Guta", 6 "Gábor Kusper" 7 ], 8 "year": 2025, 9 "venue": "FMF-AI 2025 (International Conference on Formal Methods and Foundations of Artificial Intelligence)", 10 "doi": "10.17048/fmfai.2025.78" 11 }, 12 "scan_version": 2, 13 "active_modules": [], 14 "methodology_tags": [ 15 "case-study", 16 "qualitative" 17 ], 18 "key_findings": "The paper proposes a human-in-the-loop tool for validating LLM-generated UML models by deterministically back-translating them to natural language for comparison with original requirements. Testing on a single web-shop specification (32 functional, 15 nonfunctional requirements) reveals that mechanical back-translation operates at a different abstraction level than the original requirements, making direct comparison confusing. The authors acknowledge the tool currently lacks practical capability, identifying model-to-NL translation quality and bidirectional model management as key unsolved challenges.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "No code repository, archive, or download link is provided anywhere in the paper. The tool is described as a prototype implemented in Python with LangChain and Streamlit, but no source code is released." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "The web-shop specification (32 functional, 15 nonfunctional requirements following IEEE 830) used as the test input is not released. Only fragments are shown in the paper (Listings 1-3)." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "The paper mentions Python, LangChain, Streamlit, and PlantUML as technologies used (Section 4.1) but provides no version numbers, requirements file, or environment specification sufficient to recreate the setup." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "No reproduction instructions are provided. There are no README files, setup guides, or step-by-step instructions for replicating the experiments." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper reports no quantitative results whatsoever. The three experiments (Section 5) are entirely qualitative with no measurements, so no confidence intervals or error bars are provided." 47 }, 48 "significance_tests": { 49 "applies": false, 50 "answer": false, 51 "justification": "The paper makes no comparative quantitative claims. All comparisons between approaches are qualitative observations." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No effect sizes or quantitative measurements of any kind are reported. The paper claims 'our approach works' (Section 6) without quantifying the effect." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "The entire evaluation uses a single web-shop example specification. No justification is given for why this single example is sufficient to support the paper's claims." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "No repeated runs, no variance, no quantitative results of any kind are reported across the three experiments." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": false, 73 "justification": "Section 5.1 qualitatively describes testing 'available LLM tools' for generating specifications but provides no quantitative comparison against their approach. No structured baseline comparison exists." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": false, 78 "justification": "No baselines are quantitatively compared. Section 5.1 references existing LLM tools generically without naming specific tools or providing comparative results." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": false, 83 "justification": "The system has multiple components (LLM generation, rule-based back-translation, human validation UI) but no ablation study isolates their individual contributions." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": false, 88 "justification": "No evaluation metrics are defined or reported. The evaluation is entirely qualitative with no measurements." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": false, 93 "justification": "Despite the entire approach centering on human-in-the-loop validation, no user study or formal human evaluation is conducted. The authors appear to have tested the tool themselves without involving independent evaluators." 94 }, 95 "held_out_test_set": { 96 "applies": false, 97 "answer": false, 98 "justification": "No ML training or test sets are involved. This is a tool prototype tested on a single manually-created specification." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": false, 103 "justification": "No breakdowns by requirement type, diagram type, or any other category are provided. Results are discussed only in general qualitative terms." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 5.2 reports that 'just mechanistically translating back the models to natural language is not sufficient.' Section 5.3 identifies that back-translated descriptions operate at a different abstraction level than original specifications, causing confusion from 'mechanical enumerations' vs 'compact concepts.'" 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper honestly reports several negative findings: back-translation quality is insufficient (Section 5.2), abstraction level mismatches are problematic (Section 5.3), and the tool 'currently lacks the capability to be used in practice' (Section 6)." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "The abstract modestly claims to 'propose a tool which facilitates the validation' and describes its two-step process. The paper does present this tool and demonstrate its workflow. The abstract does not overclaim quantitative results." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper claims the approach 'mitigates the risks associated with LLM black-box generation' (Abstract) and that 'it is evident that our approach works' (Section 6). These are causal claims about the effectiveness of the intervention, but the single-example qualitative evaluation with no control condition does not support causal inference." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "The title claims support for 'construction of valid formal models' generally, but the evaluation uses only UML diagrams (class, sequence, use case) on a single web-shop example. No explicit statement bounds the generalization to this specific domain, diagram type, or application class." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "No alternative explanations for the observed challenges are discussed. For example, the paper does not consider whether the abstraction level mismatch could be due to prompt quality, LLM choice, or specification complexity rather than inherent limitations of the approach." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": false, 140 "justification": "The paper claims its approach enables 'validation' and 'mitigates risks,' but what it actually demonstrates is that back-translation produces natural language text. Whether this output actually helps users validate models or reduces errors is not measured. The gap between producing back-translations and achieving validation is not discussed." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper refers only to 'a generic LLM with prompt-based configuration' (Section 4) and 'LLM tools' (Section 5.1). No specific model name, version, or provider is ever identified anywhere in the paper." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": false, 152 "justification": "The paper mentions 'prompt-based configuration' and 'extensive prompting' (Section 5.1) but provides no actual prompt text. Only the output (PlantUML listings) is shown." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": false, 157 "justification": "No hyperparameters (temperature, top-p, max tokens, etc.) are reported for any LLM interaction." 158 }, 159 "scaffolding_described": { 160 "applies": false, 161 "answer": false, 162 "justification": "The system uses LangChain as an LLM integration framework in a pipeline architecture (NL → LLM → UML → rule-based back-translation), not agentic scaffolding with tools, retry logic, or autonomous decision-making." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": false, 167 "justification": "The paper mentions the specification follows IEEE 830 format (Section 1.1) and shows fragments, but does not document how the specification was preprocessed or formatted for LLM consumption, or how the LLM output was parsed into PlantUML." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 5 (Discussion) provides substantive discussion of challenges and limitations across three experiments (5.1-5.3). Section 6 (Conclusion) explicitly states the tool 'currently lacks the capability to be used in practice' and identifies two specific areas requiring further research." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": false, 179 "justification": "The discussion in Section 5 covers implementation challenges (abstraction level mismatch, mechanical enumerations) but does not discuss threats to the validity of the study's conclusions, such as whether the web-shop example is representative, whether the qualitative findings generalize, or whether author-only evaluation introduces bias." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": false, 184 "justification": "While Section 5.3 notes 'we support only the validation of the forward-engineering approach' and Section 6 acknowledges practical limitations, the paper does not explicitly state what its results do NOT show or which settings/domains are excluded from its claims." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": false, 191 "justification": "The full web-shop specification (32 functional + 15 nonfunctional requirements) is not available. Only fragments are shown in the paper (Listings 1-3). Generated UML models and back-translations are partially shown but not fully available." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 1.1 describes the motivational example: a web-shop application specification formulated according to IEEE 830 standard with 32 functional and 15 nonfunctional requirements, chosen because it 'adequately represents the typical real-world challenges.'" 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants were involved. The test data is a self-created specification, not a standard benchmark." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": false, 206 "justification": "The high-level workflow is shown in Figures 1 and 2, but the detailed pipeline from specification input through LLM processing to UML generation and back-translation is not documented with sufficient detail to understand each transformation step." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "No funding source, grants, or sponsorship are mentioned anywhere in the paper. There is no acknowledgments section discussing funding." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are clearly stated: Gábor Guta is from Axonmatics Kft. (a company), and Gábor Kusper is from the Mathematics and Informatics Institute at Eszterházy Károly University." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": false, 223 "justification": "No funding disclosure is present, so independence cannot be assessed. The first author's affiliation with Axonmatics Kft. suggests potential commercial interest in the tool, but this conflict is not discussed." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial interests statement is present. The first author is from Axonmatics Kft. which could potentially commercialize the tool, but no declaration is made." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": false, 234 "answer": false, 235 "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. It uses an LLM as a tool for specification translation, not to measure model knowledge or capability." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": false, 239 "answer": false, 240 "justification": "No benchmark evaluation is performed. The paper tests a tool pipeline, not model capabilities on a test set." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": false, 244 "answer": false, 245 "justification": "No benchmark evaluation is performed. The web-shop specification is a custom-created input, not a benchmark for measuring model performance." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants are involved in the study. The tool is tested by the authors themselves." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants are involved in the study." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants are involved in the study." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants are involved in the study." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants are involved in the study." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants are involved in the study." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants are involved in the study." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": false, 289 "justification": "The tool uses LLM API calls for generating UML from natural language but no inference costs, API costs, or latency figures are reported." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "No computational budget, hardware requirements, or resource consumption is stated for the tool or experiments." 295 } 296 } 297 }, 298 "claims": [ 299 { 300 "claim": "The two-step approach (LLM generation + deterministic back-translation + human validation) enables semantic verification of formal models", 301 "evidence": "Section 4 describes the workflow and Section 4.1 shows implementation with PlantUML examples (Listings 2-3) and requirement-to-model mappings. However, no empirical measurement of verification effectiveness is provided.", 302 "supported": "weak" 303 }, 304 { 305 "claim": "Mechanistic back-translation of formal models to natural language is insufficient for validation", 306 "evidence": "Section 5.2: 'just mechanistically translating back the models to natural language is not sufficient even if the textual representation is nicely formulated.' The key finding is that template-based descriptions don't operate at the same abstraction level as original specifications.", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "The approach works but currently lacks practical capability", 311 "evidence": "Section 6: 'Based on our initial experiments described in the previous section, it is evident that our approach works, but we have identified two main areas of functionality that require further research.' And: 'it currently lacks the capability to be used in practice.'", 312 "supported": "weak" 313 }, 314 { 315 "claim": "Back-translated descriptions confuse users because compact concepts become mechanical enumerations at a different abstraction level", 316 "evidence": "Section 5.3 identifies two specific confusion sources: 'compact concepts are described as mechanical enumerations' and 'only a single model-specific aspect is presented.' Illustrated by Listing 3 where 'Add item to cart with quantity' becomes a multi-sentence field enumeration.", 317 "supported": "moderate" 318 } 319 ], 320 "red_flags": [ 321 { 322 "flag": "No quantitative evaluation", 323 "detail": "Despite claiming 'our approach works,' the paper reports zero quantitative metrics. No accuracy measurements, no user task performance, no error rates — only qualitative observations from the authors themselves." 324 }, 325 { 326 "flag": "Single example evaluation", 327 "detail": "The entire evaluation rests on one web-shop specification. No variation in domain, complexity, or specification style is tested. Claims about 'formal models' generally are unsupported by this single case." 328 }, 329 { 330 "flag": "Unspecified LLM", 331 "detail": "The paper never identifies which LLM was used, referring only to 'a generic LLM with prompt-based configuration.' This makes the work completely unreproducible — different models would likely produce very different results." 332 }, 333 { 334 "flag": "No user study for a human-in-the-loop tool", 335 "detail": "The core value proposition is that humans can validate back-translated models, yet no independent users were tested. The authors evaluated their own tool, introducing obvious bias. Whether the approach actually helps users catch errors is unknown." 336 }, 337 { 338 "flag": "Undisclosed potential conflict of interest", 339 "detail": "First author is from Axonmatics Kft. (a company) which could have commercial interest in the tool. No competing interests statement or funding disclosure is provided." 340 } 341 ], 342 "cited_papers": [ 343 { 344 "title": "Formalising Software Requirements using Large Language Models", 345 "authors": ["A. Beg", "D. O'Donoghue", "R. Monahan"], 346 "year": 2025, 347 "arxiv_id": "2506.10704", 348 "doi": "10.48550/arxiv.2506.10704", 349 "relevance": "Directly addresses LLM use for generating formal specifications from natural language, a core application area for AI-assisted software engineering." 350 }, 351 { 352 "title": "nl2spec: Interactively Translating Unstructured Natural Language to Temporal Logics with Large Language Models", 353 "authors": ["M. Cosler", "C. Hahn", "D. Mendoza", "F. Schmitt", "C. Trippel"], 354 "year": 2023, 355 "arxiv_id": "2303.04864", 356 "doi": "10.48550/arXiv.2303.04864", 357 "relevance": "Introduces prompt engineering and structured intermediate formats for LLM-based formal specification translation, addressing reliability of LLM outputs." 358 }, 359 { 360 "title": "A Survey on Code Generation with LLM-based Agents", 361 "authors": ["Y. Dong", "X. Jiang", "J. Qian", "T. Wang", "K. Zhang", "Z. Jin", "G. Li"], 362 "year": 2025, 363 "relevance": "Comprehensive survey on LLM-based code generation agents, directly relevant to understanding the broader LLM programming landscape." 364 }, 365 { 366 "title": "Counterexample Guided Inductive Synthesis using Large Language Models and Satisfiability Solving", 367 "authors": ["S. K. Jha", "S. Jha", "P. Lincoln", "N. D. Bastian", "A. Velasquez", "R. Ewetz", "S. Neema"], 368 "year": 2023, 369 "arxiv_id": "2309.16436", 370 "doi": "10.48550/arxiv.2309.16436", 371 "relevance": "Combines LLMs with formal verification (SAT solving) for synthesis, exploring the intersection of LLM capabilities with formal methods." 372 }, 373 { 374 "title": "Safety analysis in the era of large language models: A case study of STPA using ChatGPT", 375 "authors": ["Y. Qi", "X. Zhao", "S. Khastgir", "X. Huang"], 376 "year": 2025, 377 "doi": "10.1016/j.mlwa.2025.100622", 378 "relevance": "Evaluates ChatGPT for safety-critical analysis with human validation, directly addressing reliability and human-in-the-loop needs for LLM-based tools." 379 }, 380 { 381 "title": "From Image to UML: First Results of Image Based UML Diagram Generation Using LLMs", 382 "authors": ["A. Conrardy", "J. Cabot"], 383 "year": 2024, 384 "arxiv_id": "2404.11376", 385 "doi": "10.48550/arXiv.2404.11376", 386 "relevance": "Explores LLM capabilities for generating UML models from images, relevant to understanding LLM limitations in model-driven engineering." 387 } 388 ] 389 }