scan.json (26612B)
1 { 2 "paper": { 3 "title": "Automating Structural Engineering Workflows with Large Language Model Agents", 4 "authors": [ 5 "Haoran Liang", 6 "Yufa Zhou", 7 "Mohammad Talebi-Kalaleh", 8 "Qipei Mei" 9 ], 10 "year": 2025, 11 "venue": "arXiv", 12 "arxiv_id": "2510.11004" 13 }, 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "A GitHub repository URL is provided at the top of the paper: https://github.com/DelosLiang/masse" 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "Section 5.2 states: 'the released dataset is reorganized but faithfully mirrors the original cases, ensuring reproducibility of system performance.' The dataset of 100 scenarios is released alongside the code repository." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is mentioned in the paper. Section 4.1 mentions AutoGen version 0.9.2 in the case trajectory (Appendix D), but no systematic dependency listing is provided in the paper itself." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are described in the paper. While a GitHub link is provided and a dataset is mentioned, the paper does not include a 'Reproducing Results' section or describe specific commands to run." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": true, 41 "justification": "Figure 7b shows 95% confidence intervals computed via bootstrap for the runtime-performance relationship. However, the main results in Table 1 do not include confidence intervals or error bars." 
42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper compares multiple LLM backends (Table 1) and claims one outperforms others, but no statistical significance tests (p-values, t-tests, etc.) are reported. Comparisons are made purely by comparing point estimates." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper provides baseline context for its claims. Section 6.2 reports '132 minutes per task' reduced to 'approximately two minutes, a reduction of over 98%.' Table 2 reports absolute scores across benchmarks with the baseline (no components) at 61.8 avg vs. full system at 88.5 avg." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The dataset consists of 100 scenarios and the human evaluation uses 11 engineers, but no justification for these sample sizes is provided. No power analysis or discussion of whether these numbers are sufficient for the claims being made." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Section 6.1.1 states 'Each problem was evaluated over ten independent trials' but Table 1 reports only single point estimates per benchmark per model without standard deviation or any spread measure. Figure 7b shows bootstrap CI for runtime but not for the main performance table." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper compares MASSE performance across multiple LLM backends (GPT-3.5-turbo, GPT-4o, Claude 3.5 Sonnet, o4-mini) in Table 1, and the ablation study (Table 2) compares against a no-component baseline." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "The baselines include contemporary models: GPT-4o (2024), Claude 3.5 Sonnet (2024), and o4-mini (2025). GPT-3.5-turbo is included as a weaker comparison point. 
These are current frontier models." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Table 2 presents an ablation study varying agent memory (M) and JSON format (J) components across four benchmarks, showing the contribution of each component." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Four evaluation benchmarks are used: SAAB, SDAB, LAB, and MASEB, each measuring different aspects of the system's performance (structural analysis, design, loading, and holistic performance)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": true, 88 "justification": "Section 6.2 reports a comparative study with 11 experienced structural engineers completing a standardized racking system design task, comparing conventional workflow (132 min avg) against MASSE (approx. 2 min)." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "There is no mention of splitting the 100 scenarios into training/development and test sets. The paper does not describe using a held-out portion, and since GPT-5 is used as a judge, it is unclear if any tuning was done on the same data used for final evaluation." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table 1 provides per-benchmark breakdowns (SAAB, SDAB, LAB, MASEB) for each model. Figure 2 provides a per-failure-mode breakdown for single-agent failures. The ablation study also shows per-benchmark results." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Figure 2 categorizes failure modes of the single-agent system into Formatting (20%), Dependency (50%), and Logic (30%). Appendix A provides further details on single- vs. two-agent failures." 
104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper reports that the single-agent system failed in all 10 trials (Figure 2). GPT-3.5-turbo underperformance is explicitly discussed (67.7 MASEB). The ablation shows that removing components hurts performance substantially." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims MASSE 'can reduce expert workload from approximately two hours to mere minutes' which is supported by Section 6.2 (132 min to ~2 min). The claim of 'enhancing both reliability and accuracy' is supported by Table 1 benchmark scores. The 'first Multi-Agent System for Structural Engineering' claim appears consistent with the related work review." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper makes causal claims about memory and JSON format improving performance (Table 2 ablation). The ablation study uses controlled single-variable manipulation (adding M, adding J, adding both) which is adequate for the causal claims about component contributions." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title says 'Automating Structural Engineering Workflows' broadly, but the evaluation is limited to a single task type: racking system design in British Columbia. While Section 5.1 acknowledges this is 'a representative task,' the paper's discussion (Section 7) and conclusion extrapolate to 'other domains where tasks are verbalizable, procedural, and tool-mediated' without bounding these claims to the tested setting." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not discuss alternative explanations for its results. 
For example, the 98% time reduction could partly be due to the specific task being well-suited to automation rather than the multi-agent architecture. The GPT-5 judge evaluation could have systematic biases. No threats-to-validity or confound discussion is present." 131 } 132 }, 133 "setup_transparency": { 134 "model_versions_specified": { 135 "applies": true, 136 "answer": false, 137 "justification": "Section 4.1 mentions 'GPT-4o', 'Claude 3.5 Sonnet', 'o4-mini', and 'GPT-3.5-turbo' without specific API version IDs or snapshot dates. 'GPT-4o' and 'Claude 3.5 Sonnet' are marketing names, not specific versions. No snapshot dates are provided." 138 }, 139 "prompts_provided": { 140 "applies": true, 141 "answer": true, 142 "justification": "Appendix B (Figures 8-10) provides the complete system instruction prompts for all agents: Loading Analyst, Seismic Analyst, Dynamic Analyst, Structural Analyst, Design Engineer, Model Engineer, Verification Engineer, Project Manager, and Safety Manager. These are the actual prompt texts used." 143 }, 144 "hyperparameters_reported": { 145 "applies": true, 146 "answer": true, 147 "justification": "Section 4.1 states: 'Temperatures are set to 0 for GPT-4o, Claude 3.5 Sonnet, and GPT-3.5-turbo, and 1 for o4-mini.' Appendix D also notes 'Max tokens = 2000'. The embedding model text-embedding-3 is mentioned for RAG." 148 }, 149 "scaffolding_described": { 150 "applies": true, 151 "answer": true, 152 "justification": "The agentic scaffolding is described in detail: Section 3 covers the three-team role design, Section 4 covers communication protocols and agent interactions, Figure 3 shows the overall framework, and Appendix C provides detailed component descriptions. The AutoGen framework is identified as the orchestration layer." 
153 }, 154 "data_preprocessing_documented": { 155 "applies": true, 156 "answer": true, 157 "justification": "Section 5.2 explains the dataset construction: 'constructed a domain-specific dataset consisting of one hundred different levels of difficulty, each paired with validated ground-truth solutions.' It notes the dataset was 'reorganized but faithfully mirrors the original cases' due to privacy constraints." 158 } 159 }, 160 "limitations_and_scope": { 161 "limitations_section_present": { 162 "applies": true, 163 "answer": false, 164 "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. The paper has Discussion (Section 7) and Ethics Statement sections but neither serves as a limitations discussion. The Discussion covers Transparency, Safety, and Real-World Impact as positive framing." 165 }, 166 "threats_to_validity_specific": { 167 "applies": true, 168 "answer": false, 169 "justification": "No specific threats to validity are discussed. The paper does not address limitations of the GPT-5 judge, potential biases in the evaluation, the restricted domain scope, or threats from using a single task type." 170 }, 171 "scope_boundaries_stated": { 172 "applies": true, 173 "answer": false, 174 "justification": "The paper does not explicitly state what the results do NOT show. The Ethics Statement says the system is 'for academic research and educational use' only, but this is a liability disclaimer, not a scope boundary about the generalizability of findings. The conclusion suggests 'future work will emphasize deployment in consulting environments' but does not explicitly bound current claims." 175 } 176 }, 177 "data_integrity": { 178 "raw_data_available": { 179 "applies": true, 180 "answer": false, 181 "justification": "While the dataset is said to be released, the paper does not mention releasing the raw experimental logs, traces, or individual trial results. 
The released data appears to be the problem scenarios, not the raw evaluation outputs from the 10 trials per problem." 182 }, 183 "data_collection_described": { 184 "applies": true, 185 "answer": true, 186 "justification": "Section 5.2 describes the data collection: a domain-specific dataset of 100 scenarios derived from real-world racking system projects in British Columbia, reorganized for privacy. Each sample contains natural language descriptions, intermediate reasoning steps, and ground-truth solutions." 187 }, 188 "recruitment_methods_described": { 189 "applies": true, 190 "answer": false, 191 "justification": "The paper mentions '11 experienced structural engineers' in Section 6.2 but provides no details on how they were recruited, their specific experience levels, or whether they had any prior relationship with the authors. This could introduce selection bias." 192 }, 193 "data_pipeline_documented": { 194 "applies": true, 195 "answer": true, 196 "justification": "Appendix D provides a detailed step-by-step trace of the full analysis pipeline from problem input through all 10 steps to final safety assessment. Appendix E details the simulation setup and evaluation metrics with rubrics." 197 } 198 }, 199 "conflicts_of_interest": { 200 "funding_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "The Acknowledgement section states: 'This work was supported by Natural Sciences and Engineering Research Council of Canada (NSERC) through the Alliance Grant [ALLRP 581074-22].'" 204 }, 205 "affiliations_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "Author affiliations are clearly stated: University of Alberta and Duke University. The paper evaluates third-party LLMs (OpenAI, Anthropic models), not products from the authors' own institutions." 
209 }, 210 "funder_independent_of_outcome": { 211 "applies": true, 212 "answer": true, 213 "justification": "The funder is NSERC, a Canadian government funding agency that has no commercial stake in the outcome of LLM-based structural engineering automation research." 214 }, 215 "financial_interests_declared": { 216 "applies": true, 217 "answer": false, 218 "justification": "No competing interests or financial interests statement is present in the paper. The absence of disclosure is not the same as absence of conflict." 219 } 220 }, 221 "contamination": { 222 "training_cutoff_stated": { 223 "applies": true, 224 "answer": false, 225 "justification": "The paper uses GPT-4o, Claude 3.5 Sonnet, GPT-3.5-turbo, and o4-mini on its custom benchmarks but does not state the training data cutoff dates for any of these models. While the dataset is custom, the models could have seen similar structural engineering problems." 226 }, 227 "train_test_overlap_discussed": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper does not discuss whether the models may have been exposed to similar structural engineering problems or racking system designs during training. The dataset is derived from real-world projects, which could partially overlap with public engineering data." 231 }, 232 "benchmark_contamination_addressed": { 233 "applies": true, 234 "answer": false, 235 "justification": "The benchmarks (SAAB, SDAB, LAB, MASEB) are novel and custom, which partially mitigates contamination risk. However, the paper does not discuss contamination at all, nor does it address whether the underlying engineering knowledge tested could be in the training data." 236 } 237 }, 238 "human_studies": { 239 "pre_registered": { 240 "applies": true, 241 "answer": false, 242 "justification": "Section 6.2 describes a study with 11 engineers but there is no mention of pre-registration." 
243 }, 244 "irb_or_ethics_approval": { 245 "applies": true, 246 "answer": false, 247 "justification": "The Ethics Statement discusses data confidentiality for the human evaluation but does not mention IRB or ethics board approval." 248 }, 249 "demographics_reported": { 250 "applies": true, 251 "answer": false, 252 "justification": "Section 6.2 describes participants only as '11 experienced structural engineers.' No demographics are reported: years of experience, education level, geographic distribution, or other characteristics." 253 }, 254 "inclusion_exclusion_criteria": { 255 "applies": true, 256 "answer": false, 257 "justification": "No inclusion or exclusion criteria are stated for the 11 engineer participants. It is unclear what 'experienced' means or how they were selected." 258 }, 259 "randomization_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "This is not a randomized experiment — all engineers completed the same task using the conventional workflow. There is no treatment/control comparison among human participants." 263 }, 264 "blinding_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "This is not a blinded study — engineers were asked to complete a specific task manually. There was no comparison condition where blinding would be relevant." 268 }, 269 "attrition_reported": { 270 "applies": true, 271 "answer": false, 272 "justification": "No information is provided about whether any engineers started but did not complete the study, or whether 11 was the initial or final number of participants." 273 } 274 }, 275 "cost_and_practicality": { 276 "inference_cost_reported": { 277 "applies": true, 278 "answer": true, 279 "justification": "Figure 7a provides a cost analysis showing token consumption (log10 scale) and runtime per method. Section 6.1.2 discusses the cost-performance tradeoff across models. However, actual dollar costs are not reported." 
280 }, 281 "compute_budget_stated": { 282 "applies": true, 283 "answer": false, 284 "justification": "While token consumption and runtime are visualized in Figure 7a, no total computational budget is stated (e.g., total API spend, total number of API calls across all experiments). The paper does not quantify the total cost of running 100 scenarios × 10 trials × 4 models." 285 } 286 } 287 }, 288 "claims": [ 289 { 290 "claim": "MASSE can reduce expert workload from approximately two hours to mere minutes (over 98% reduction).", 291 "evidence": "Section 6.2: 11 engineers averaged 132 minutes per task; MASSE with GPT-4o completed in approximately 2 minutes.", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "Reasoning model o4-mini provides the most stable and generalizable performance across heterogeneous structural engineering tasks.", 296 "evidence": "Table 1: o4-mini achieves highest scores on SAAB (96.6), SDAB (91.4), MASEB (94.7), and average (94.1), outperforming all language models.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Multi-agent systems are necessary for long-horizon structural engineering tasks; single-agent fails 100% of the time.", 301 "evidence": "Figure 2 and Appendix A: Single-agent (GPT-4o) failed in all 10 trials with errors distributed across formatting (20%), dependency (50%), and logic (30%). Two-agent system succeeded in every trial.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "Agent memory and structured JSON I/O are crucial components, with their combination yielding the best results.", 306 "evidence": "Table 2: Baseline (no M, no J) scores 61.8 avg; +M alone scores 66.5; +J alone scores 64.5; +M,J scores 88.5.", 307 "supported": "strong" 308 }, 309 { 310 "claim": "MASSE is the first multi-agent system for structural engineering.", 311 "evidence": "Introduction and abstract. 
The related work section (2.1) reviews prior LLM applications in civil engineering but identifies no prior multi-agent system for full structural engineering workflows.", 312 "supported": "moderate" 313 } 314 ], 315 "methodology_tags": [ 316 "benchmark-eval", 317 "case-study" 318 ], 319 "key_findings": "MASSE, a multi-agent LLM system for structural engineering, coordinates Analyst, Engineer, and Management agent teams to automate racking system design. Using custom benchmarks (SAAB, SDAB, LAB, MASEB) on 100 real-world-derived scenarios, the reasoning model o4-mini achieved the highest overall score (94.1 avg), while Claude 3.5 Sonnet and GPT-4o also performed well. A human evaluation with 11 structural engineers showed MASSE reduced task completion time from 132 minutes to approximately 2 minutes (98% reduction). Ablation studies demonstrated that both structured memory and JSON-based communication are essential components.", 320 "red_flags": [ 321 { 322 "flag": "No limitations section", 323 "detail": "The paper lacks any dedicated limitations or threats-to-validity discussion. The Discussion section (Section 7) frames everything positively (transparency, safety, real-world impact) without acknowledging weaknesses." 324 }, 325 { 326 "flag": "GPT-5 as automated judge without validation", 327 "detail": "The evaluation uses GPT-5 as an LLM Judge (Section E.2.1) to score MASSE outputs against ground truth. There is no discussion of the judge's reliability, potential biases, or correlation with human expert judgment. LLM judges can have systematic biases." 
328 }, 329 { 330 "flag": "Single task domain generalized broadly", 331 "detail": "All evaluation is on racking system design in British Columbia, but the paper title, abstract, and discussion generalize to 'structural engineering workflows' broadly and even to 'knowledge-intensive sectors such as architecture, finance, and healthcare.'" 332 }, 333 { 334 "flag": "No variance reported for main results", 335 "detail": "Despite running 10 trials per problem, Table 1 reports only point estimates without standard deviations or confidence intervals. The reader cannot assess result stability." 336 }, 337 { 338 "flag": "Minimal human study methodology", 339 "detail": "The human evaluation (11 engineers) lacks demographics, recruitment methods, inclusion criteria, IRB approval, and attrition reporting. The comparison is between manual task completion time and automated system time, which conflates many factors." 340 }, 341 { 342 "flag": "Self-citation cluster", 343 "detail": "Several references (e.g., [17], [18], [31], [37], [41]-[43], [60]-[64]) appear to be works by co-authors (Liang, Zhou, Song, Shen) in unrelated areas (model compression, diffusion models, circuit complexity). These bulk up the reference list without contributing to the paper's technical narrative." 344 } 345 ], 346 "cited_papers": [ 347 { 348 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 349 "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"], 350 "year": 2023, 351 "arxiv_id": "2308.08155", 352 "relevance": "Core multi-agent orchestration framework used as the backbone of MASSE." 353 }, 354 { 355 "title": "Why Do Multi-Agent LLM Systems Fail?", 356 "authors": ["Mert Cemri", "Melissa Z Pan", "Shuyi Yang"], 357 "year": 2025, 358 "arxiv_id": "2503.13657", 359 "relevance": "Systematic analysis of failure modes in multi-agent LLM systems, directly relevant to understanding agentic system reliability." 
360 }, 361 { 362 "title": "AI Agents That Matter", 363 "authors": ["Sayash Kapoor", "Benedikt Stroebl", "Zachary S Siegel"], 364 "year": 2024, 365 "arxiv_id": "2407.01502", 366 "relevance": "Highlights the need for rigorous benchmarking practices in LLM agent evaluation." 367 }, 368 { 369 "title": "Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks", 370 "authors": ["Adam Fourney", "Gagan Bansal", "Hussein Mozannar"], 371 "year": 2024, 372 "arxiv_id": "2411.04468", 373 "relevance": "General-purpose multi-agent system for complex task-solving, providing context for domain-specific multi-agent architectures." 374 }, 375 { 376 "title": "ChatDev: Communicative Agents for Software Development", 377 "authors": ["Chen Qian", "Wei Liu", "Hongzhang Liu"], 378 "year": 2023, 379 "arxiv_id": "2307.07924", 380 "relevance": "Multi-agent LLM framework for software engineering, a closely related domain to structural engineering automation." 381 }, 382 { 383 "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework", 384 "authors": ["Sirui Hong", "Xiawu Zheng", "Jonathan Chen"], 385 "year": 2023, 386 "arxiv_id": "2308.00352", 387 "relevance": "Multi-agent framework coordinating agents for software tasks, relevant to understanding role-based agent architectures." 388 }, 389 { 390 "title": "HyperAgent: Generalist Software Engineering Agents to Solve Coding Tasks at Scale", 391 "authors": ["Huy Nhat Phan", "Tien N Nguyen"], 392 "year": 2024, 393 "arxiv_id": "2409.16299", 394 "relevance": "Generalist software engineering agent evaluated at scale, relevant to benchmarking agentic programming systems." 395 }, 396 { 397 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 398 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"], 399 "year": 2023, 400 "relevance": "Foundational agent reasoning framework that MASSE builds upon for structured workflows." 
401 }, 402 { 403 "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models", 404 "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"], 405 "year": 2023, 406 "relevance": "Structured reasoning framework for LLMs, foundational to the agent reasoning approaches used in MASSE." 407 }, 408 { 409 "title": "The Illusion of Diminishing Returns: Measuring Long Horizon Execution in LLMs", 410 "authors": ["Akshit Sinha", "Arvindh Arun", "Shashwat Goel"], 411 "year": 2025, 412 "arxiv_id": "2509.09677", 413 "relevance": "Directly relevant to understanding why single-agent systems fail on long-horizon tasks, motivating the multi-agent approach." 414 }, 415 { 416 "title": "Taxonomy of Failure Modes in Agentic AI Systems: Whitepaper", 417 "authors": ["Microsoft Corporation"], 418 "year": 2025, 419 "relevance": "Systematic taxonomy of failure modes in agentic AI systems, relevant to understanding and categorizing agent failures." 420 }, 421 { 422 "title": "Gödel Agent: A Self-Referential Agent Framework for Recursive Self-Improvement", 423 "authors": ["Xunjian Yin", "Xinyi Wang", "Liangming Pan"], 424 "year": 2024, 425 "arxiv_id": "2410.04444", 426 "relevance": "Self-evolving agent framework relevant to the survey's coverage of advanced agentic architectures." 427 } 428 ] 429 }