scan-v5.json (21431B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "LLM-based framework to support the construction of valid formal models", 6 "authors": [ 7 "Gábor Guta", 8 "Gábor Kusper" 9 ], 10 "year": 2025, 11 "venue": "FMF-AI 2025", 12 "arxiv_id": null, 13 "doi": "10.17048/fmfai.2025.78" 14 }, 15 "checklist": { 16 "claims_and_evidence": { 17 "abstract_claims_supported": { 18 "applies": true, 19 "answer": false, 20 "justification": "Abstract claims the approach 'mitigates risks' and enables 'semantic verification,' but the conclusion explicitly states 'it currently lacks the capability to be used in practice' (p. 87). Evidence contradicts claims.", 21 "source": "haiku" 22 }, 23 "causal_claims_justified": { 24 "applies": true, 25 "answer": false, 26 "justification": "Paper claims human-in-the-loop validation 'mitigates risks' from LLMs but provides no comparative evidence—no error rates with/without validation, no baseline.", 27 "source": "haiku" 28 }, 29 "generalization_bounded": { 30 "applies": true, 31 "answer": false, 32 "justification": "Framework tested on single web-shop example (32 functional + 15 nonfunctional requirements). Title suggests broad applicability; generalizability to other domains/specification types not discussed.", 33 "source": "haiku" 34 }, 35 "alternative_explanations_discussed": { 36 "applies": true, 37 "answer": false, 38 "justification": "Paper does not discuss whether alternative approaches (multi-model consensus, automated verification, existing UML tools) might also mitigate LLM risks in formal modeling.", 39 "source": "haiku" 40 }, 41 "proxy_outcome_distinction": { 42 "applies": true, 43 "answer": false, 44 "justification": "Paper measures human agreement with back-translations but doesn't validate whether resulting UML models are actually correct or satisfy all requirements.", 45 "source": "haiku" 46 } 47 }, 48 "limitations_and_scope": { 49 "limitations_section_present": { 50 "applies": true, 51 "answer": false, 52 "justification": "No dedicated limitations or threats-to-validity section. Limitations scattered in discussion/conclusion (e.g., 'needs further fine-tuning') but not systematically catalogued.", 53 "source": "haiku" 54 }, 55 "threats_to_validity_specific": { 56 "applies": true, 57 "answer": false, 58 "justification": "No specific threats discussed: single example, no inter-rater reliability metrics, no baseline comparisons, no measurement of validation consistency.", 59 "source": "haiku" 60 }, 61 "scope_boundaries_stated": { 62 "applies": true, 63 "answer": false, 64 "justification": "Scope limitations (no bidirectional propagation, forward-engineering only) mentioned in future work section rather than explicitly stated as scope boundaries.", 65 "source": "haiku" 66 } 67 }, 68 "conflicts_of_interest": { 69 "funding_disclosed": { 70 "applies": true, 71 "answer": false, 72 "justification": "No funding source disclosed and no acknowledgments section provided in paper.", 73 "source": "haiku" 74 }, 75 "affiliations_disclosed": { 76 "applies": true, 77 "answer": true, 78 "justification": "Author affiliations disclosed: Axonmatics Kft. (tool developer) and Eszterházy Károly University.", 79 "source": "haiku" 80 }, 81 "funder_independent_of_outcome": { 82 "applies": false, 83 "answer": false, 84 "justification": "No external funding disclosed; cannot assess independence.", 85 "source": "haiku" 86 }, 87 "financial_interests_declared": { 88 "applies": true, 89 "answer": false, 90 "justification": "No competing interests statement or declaration of patents, equity, or financial interests provided.", 91 "source": "haiku" 92 } 93 }, 94 "scope_and_framing": { 95 "key_terms_defined": { 96 "applies": true, 97 "answer": false, 98 "justification": "Key terms 'semantic verification,' 'valid formal models,' and 'formal specification' used without precise definitions in paper's specific context.", 99 "source": "haiku" 100 }, 101 "intended_contribution_clear": { 102 "applies": true, 103 "answer": true, 104 "justification": "Contribution is clearly stated: LLM-assisted formal modeling with human-in-the-loop validation via back-translation, though authors admit tool lacks practical capability.", 105 "source": "haiku" 106 }, 107 "engagement_with_prior_work": { 108 "applies": true, 109 "answer": false, 110 "justification": "Section 2 lists related work but does not clearly articulate how this approach differs from or builds on existing LLM-based specification tools or human-in-the-loop validation methods.", 111 "source": "haiku" 112 } 113 } 114 }, 115 "type_checklist": { 116 "empirical": { 117 "artifacts": { 118 "code_released": { 119 "applies": true, 120 "answer": false, 121 "justification": "No source code released or made available. Implementation stack (Python, LangChain, Streamlit) is mentioned but no repository link or code access provided.", 122 "source": "haiku" 123 }, 124 "data_released": { 125 "applies": true, 126 "answer": false, 127 "justification": "Only one illustrative example (web-shop application) shown in paper. No dataset or benchmark of formal specifications released.", 128 "source": "haiku" 129 }, 130 "environment_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "Technology stack mentioned (Python, LangChain, Streamlit, PlantUML) but no requirements.txt, version specifications, or dependency documentation provided.", 134 "source": "haiku" 135 }, 136 "reproduction_instructions": { 137 "applies": true, 138 "answer": false, 139 "justification": "No step-by-step reproduction instructions. Tool is not publicly available; only high-level workflow description provided.", 140 "source": "haiku" 141 } 142 }, 143 "statistical_methodology": { 144 "confidence_intervals_or_error_bars": { 145 "applies": true, 146 "answer": false, 147 "justification": "No statistical results reported. Single qualitative case study with no quantitative metrics, confidence intervals, or error bars.", 148 "source": "haiku" 149 }, 150 "significance_tests": { 151 "applies": false, 152 "answer": false, 153 "justification": "No comparative claims between conditions, so significance testing not applicable.", 154 "source": "haiku" 155 }, 156 "effect_sizes_reported": { 157 "applies": true, 158 "answer": false, 159 "justification": "No effect sizes or percentage improvements reported.", 160 "source": "haiku" 161 }, 162 "sample_size_justified": { 163 "applies": true, 164 "answer": false, 165 "justification": "Sample size is 1 (one web-shop example) with no justification for adequacy or power analysis.", 166 "source": "haiku" 167 }, 168 "variance_reported": { 169 "applies": true, 170 "answer": false, 171 "justification": "No variance, standard deviations, or multiple runs reported.", 172 "source": "haiku" 173 } 174 }, 175 "evaluation_design": { 176 "baselines_included": { 177 "applies": true, 178 "answer": false, 179 "justification": "No baseline comparisons. Tool not compared to existing diagram generation tools (DiagramGPT, UMLAI), manual specification, or alternative approaches.", 180 "source": "haiku" 181 }, 182 "baselines_contemporary": { 183 "applies": false, 184 "answer": false, 185 "justification": "No baselines included.", 186 "source": "haiku" 187 }, 188 "ablation_study": { 189 "applies": true, 190 "answer": false, 191 "justification": "No ablation study. No comparison of system with/without human-in-the-loop, with/without back-translation, or with/without validation.", 192 "source": "haiku" 193 }, 194 "multiple_metrics": { 195 "applies": true, 196 "answer": false, 197 "justification": "No metrics provided. Only qualitative example shown with no measurement of model quality, validation accuracy, or usability.", 198 "source": "haiku" 199 }, 200 "human_evaluation": { 201 "applies": true, 202 "answer": false, 203 "justification": "While tool includes human validation by design, no formal user study or inter-rater agreement study conducted. Authors informally checked own example.", 204 "source": "haiku" 205 }, 206 "held_out_test_set": { 207 "applies": false, 208 "answer": false, 209 "justification": "Not a prediction task; held-out test set not applicable.", 210 "source": "haiku" 211 }, 212 "per_category_breakdown": { 213 "applies": true, 214 "answer": false, 215 "justification": "Only 4 of 47 requirements mapped in example (FR-5, FR-6, FR-7, FR-8). No breakdown by requirement category or analysis of which types handled well/poorly.", 216 "source": "haiku" 217 }, 218 "failure_cases_discussed": { 219 "applies": true, 220 "answer": false, 221 "justification": "General challenges mentioned (Section 5.3: 'compact concepts described as mechanical enumerations') but specific failure cases not systematically documented.", 222 "source": "haiku" 223 }, 224 "negative_results_reported": { 225 "applies": true, 226 "answer": false, 227 "justification": "Problems mentioned in conclusion ('tool currently lacks capability to be used in practice,' 'needs further fine-tuning') but not systematically reported as negative results.", 228 "source": "haiku" 229 } 230 }, 231 "setup_transparency": { 232 "model_versions_specified": { 233 "applies": true, 234 "answer": false, 235 "justification": "No specific LLM model named. Paper mentions 'generic LLM' without specifying GPT version, date, or model snapshot.", 236 "source": "haiku" 237 }, 238 "prompts_provided": { 239 "applies": true, 240 "answer": false, 241 "justification": "Paper mentions 'intensive prompting' improves quality but provides no actual prompts, system instructions, or prompt templates.", 242 "source": "haiku" 243 }, 244 "hyperparameters_reported": { 245 "applies": true, 246 "answer": false, 247 "justification": "No hyperparameters reported (temperature, top_p, max_tokens, etc.). No LLM configuration details provided.", 248 "source": "haiku" 249 }, 250 "scaffolding_described": { 251 "applies": true, 252 "answer": false, 253 "justification": "Human-in-the-loop approach described at high level; back-translation mentioned but specific scaffolding details (validation criteria, mapping rules, prompt structure) not provided.", 254 "source": "haiku" 255 }, 256 "data_preprocessing_documented": { 257 "applies": true, 258 "answer": false, 259 "justification": "No preprocessing steps documented. No description of how natural language requirements are parsed, normalized, or prepared.", 260 "source": "haiku" 261 } 262 }, 263 "data_integrity": { 264 "raw_data_available": { 265 "applies": true, 266 "answer": false, 267 "justification": "Single example specification partially shown in paper; full data not made available for independent verification.", 268 "source": "haiku" 269 }, 270 "data_collection_described": { 271 "applies": true, 272 "answer": false, 273 "justification": "Example appears to be author-created for illustration. How/why this example was chosen and whether representative not discussed.", 274 "source": "haiku" 275 }, 276 "recruitment_methods_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human study; recruitment not applicable.", 280 "source": "haiku" 281 }, 282 "data_pipeline_documented": { 283 "applies": true, 284 "answer": false, 285 "justification": "High-level workflow shown (Figure 1). Detailed pipeline from requirements parsing through validation not fully documented.", 286 "source": "haiku" 287 } 288 }, 289 "contamination": { 290 "training_cutoff_stated": { 291 "applies": false, 292 "answer": false, 293 "justification": "Tool paper, not evaluating model capabilities on benchmarks; not applicable.", 294 "source": "haiku" 295 }, 296 "train_test_overlap_discussed": { 297 "applies": false, 298 "answer": false, 299 "justification": "Not applicable to tool paper.", 300 "source": "haiku" 301 }, 302 "benchmark_contamination_addressed": { 303 "applies": false, 304 "answer": false, 305 "justification": "Not applicable to tool paper.", 306 "source": "haiku" 307 } 308 }, 309 "human_studies": { 310 "pre_registered": { 311 "applies": false, 312 "answer": false, 313 "justification": "No human subjects study conducted.", 314 "source": "haiku" 315 }, 316 "irb_or_ethics_approval": { 317 "applies": false, 318 "answer": false, 319 "justification": "No human subjects study; IRB not applicable.", 320 "source": "haiku" 321 }, 322 "demographics_reported": { 323 "applies": false, 324 "answer": false, 325 "justification": "No human participants.", 326 "source": "haiku" 327 }, 328 "inclusion_exclusion_criteria": { 329 "applies": false, 330 "answer": false, 331 "justification": "No human participants.", 332 "source": "haiku" 333 }, 334 "randomization_described": { 335 "applies": false, 336 "answer": false, 337 "justification": "No human study; randomization not applicable.", 338 "source": "haiku" 339 }, 340 "blinding_described": { 341 "applies": false, 342 "answer": false, 343 "justification": "No human study; blinding not applicable.", 344 "source": "haiku" 345 }, 346 "attrition_reported": { 347 "applies": false, 348 "answer": false, 349 "justification": "No human study; attrition not applicable.", 350 "source": "haiku" 351 } 352 }, 353 "cost_and_practicality": { 354 "inference_cost_reported": { 355 "applies": true, 356 "answer": false, 357 "justification": "No inference cost, latency, or computational resource measurements reported.", 358 "source": "haiku" 359 }, 360 "compute_budget_stated": { 361 "applies": true, 362 "answer": false, 363 "justification": "No total computational budget, API call counts, or token usage disclosed.", 364 "source": "haiku" 365 } 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "The tool facilitates validation of formal models generated by LLMs from natural language specifications", 372 "evidence": "Section 3-4 describes the two-step validation approach with back-translation and human review", 373 "supported": "moderate" 374 }, 375 { 376 "claim": "Human-in-the-loop validation mitigates risks from LLM black-box generation", 377 "evidence": "Claimed in abstract and introduction, but no comparative error rates or empirical validation provided", 378 "supported": "weak" 379 }, 380 { 381 "claim": "The tool supports exploration of real-world system challenges", 382 "evidence": "Only single simplified web-shop example shown; claim of real-world relevance unsupported", 383 "supported": "weak" 384 }, 385 { 386 "claim": "Back-translation ensures semantic correctness of generated models", 387 "evidence": "Approach described but no formal verification that models are correct or complete", 388 "supported": "weak" 389 } 390 ], 391 "methodology_tags": [ 392 "case-study", 393 "qualitative" 394 ], 395 "key_findings": "The paper proposes a tool combining LLM-based formal model generation (UML) with human-in-the-loop validation through back-translation to natural language. Testing on a single web-shop example (32 functional + 15 nonfunctional requirements) demonstrates the workflow can generate and map some requirements. However, authors identify critical limitations: back-translation abstracts details in ways that confuse users ('compact concepts described as mechanical enumerations'), and the tool 'currently lacks the capability to be used in practice.' Bidirectional consistency propagation remains unimplemented.", 396 "red_flags": [ 397 { 398 "flag": "Mismatch between claims and reality", 399 "detail": "Abstract claims approach 'mitigates risks' and enables 'semantic verification,' but conclusion admits tool 'currently lacks capability to be used in practice.'" 400 }, 401 { 402 "flag": "No empirical validation", 403 "detail": "Zero metrics, zero baselines, zero user study. Core claim about risk mitigation never measured." 404 }, 405 { 406 "flag": "Single example only", 407 "detail": "One web-shop case study with 47 requirements, 4 fully shown. No evidence of generalizability across domains or specification types." 408 }, 409 { 410 "flag": "Unspecified LLM setup", 411 "detail": "No LLM model, version, date, prompts, or hyperparameters disclosed. Reproducibility impossible." 412 }, 413 { 414 "flag": "No code or data release", 415 "detail": "Tool not available for verification. Full specification for example not released." 416 }, 417 { 418 "flag": "Subjective validation", 419 "detail": "Human agreement on back-translation is subjective and unblinded. No inter-rater reliability measured." 420 }, 421 { 422 "flag": "Tool is incomplete prototype", 423 "detail": "Authors explicitly state bidirectional consistency not supported, only forward engineering, limiting practical utility." 424 }, 425 { 426 "flag": "No baselines", 427 "detail": "Not compared to existing UML generation tools (DiagramGPT, UMLAI), manual specification, or alternative validation methods." 428 }, 429 { 430 "flag": "Limited scope boundaries", 431 "detail": "Scope limitations (no bidirectional propagation) mentioned in future work rather than clearly stated upfront." 432 }, 433 { 434 "flag": "Overreaching contribution", 435 "detail": "Paper frames this as a 'framework' but presents an incomplete prototype for a single example." 436 } 437 ], 438 "cited_papers": [ 439 { 440 "title": "Formalising Software Requirements using Large Language Models", 441 "relevance": "Direct precedent: LLM-based formal requirement specification generation" 442 }, 443 { 444 "title": "nl2spec: Interactively Translating Unstructured Natural Language to Temporal Logics with Large Language Models", 445 "relevance": "Alternative approach to LLM-assisted formal specification; shows prior work in NL-to-logic translation" 446 }, 447 { 448 "title": "Counterexample Guided Inductive Synthesis using Large Language Models and Satisfiability Solving", 449 "relevance": "LLM role in formal artifact generation with synthesis techniques" 450 }, 451 { 452 "title": "Natural Language Generation from Class Diagrams", 453 "relevance": "Reverse direction (model-to-NL), foundational for back-translation validation approach" 454 }, 455 { 456 "title": "Safety analysis in the era of large language models: A case study of STPA using ChatGPT", 457 "relevance": "Human-in-the-loop validation in safety-critical domains; parallel approach for LLM risk mitigation" 458 }, 459 { 460 "title": "Reasoning on UML Class Diagrams", 461 "relevance": "Formal semantics of UML; theoretical foundation for understanding what makes UML models 'valid'" 462 } 463 ], 464 "engagement_factors": { 465 "practical_relevance": { 466 "score": 1, 467 "justification": "Tool is incomplete and authors explicitly state it 'currently lacks capability to be used in practice,' severely limiting immediate practical value." 468 }, 469 "surprise_contrarian": { 470 "score": 1, 471 "justification": "Combines well-known ideas (LLM code generation + human review + back-translation verification) without novel insight or contrarian perspective." 472 }, 473 "fear_safety": { 474 "score": 1, 475 "justification": "Addresses legitimate concern about LLM reliability in formal modeling but provides no evidence the approach actually mitigates risks." 476 }, 477 "drama_conflict": { 478 "score": 0, 479 "justification": "Standard software engineering tool-building paper with no controversial claims or notable conflict angle." 480 }, 481 "demo_ability": { 482 "score": 0, 483 "justification": "Tool is not released publicly; no runnable demo or reproducible example provided to readers." 484 }, 485 "brand_recognition": { 486 "score": 0, 487 "justification": "Authors affiliated with Axonmatics Kft. (company) and Eszterházy Károly Catholic University (non-elite institution); no major lab affiliation." 488 } 489 }, 490 "hn_data": { 491 "threads": [], 492 "top_points": 0, 493 "total_points": 0, 494 "total_comments": 0 495 } 496 }