scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26715B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Large Language Model Agent for User-friendly Chemical Process Simulations",
      6     "authors": [
      7       "Jingkang Liang",
      8       "Niklas Groll",
      9       "Gürkan Sin"
     10     ],
     11     "year": 2026,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2601.11650",
     14     "doi": "10.48550/arXiv.2601.11650"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "Abstract claims (autonomous analysis, iterative optimization, step-by-step guided construction, limitations like calculation errors) are all demonstrated with specific documented examples in the two case studies.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper makes implicit causal claims that the MCP-LLM framework 'enhances' usability and 'improves' accessibility, but two uncontrolled case studies with no comparison to manual expert work or alternative systems provide insufficient design for causal inference.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Conclusions describe LLM agents as 'valuable collaborators' in process engineering broadly, but all evidence comes from a single simple binary water-methanol system chosen for its simplicity; the acknowledgment of this in the conclusion does not sufficiently constrain the broad claims.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper does not consider alternative explanations for why the agent succeeds or fails (e.g., whether apparent thermodynamic reasoning reflects genuine understanding or recall of training data), focusing solely on demonstrating the framework.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper uses qualitative categories and case-study success as proxies for 'user-friendly' and 'accessible' simulation without distinguishing these proxies from actual productivity, learning, or usability gains for real users.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Limitations are discussed extensively throughout Section 5 and Section 6, including specific failure modes, calculation errors, unverified economic claims, and constraints requiring expert intervention.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Specific limitations are identified: arithmetic errors (10.9% vs. correct 12.9%), inability to recognize unspecified APS variables, incorrect tray efficiency suggestion, and the intentionally simple test system that limits generalizability.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Section 6 explicitly states the simple water-methanol system 'was intentionally selected to clearly demonstrate the framework's core capabilities' and calls for future work on complex industrial flowsheets with reactors, recycles, and heat integration.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 8 discloses: 'funded by the European Union Horizon Europe 2022 Research and Innovation Program under the Marie Sklodowska-Curie Grant Agreement No. 101119358 (PROSAFE)'.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors' affiliations (Process and System Engineering Center, DTU) are clearly stated on the title page.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The funder (EU Horizon Europe/PROSAFE research grant) is independent of commercial outcomes; the paper uses commercial software (AVEVA APS, Claude) but the funder has no financial stake in those products.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "There is no competing interests statement or declaration of financial interests anywhere in the paper, despite using and evaluating Anthropic's Claude model as the core system component.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms including 'LLM agent', 'Model Context Protocol (MCP)', 'MCP server/client', and the two interaction modes are defined and described in Sections 2 and 3.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper clearly states: developing an LLM agent framework via MCP that enables natural-language interaction with AVEVA Process Simulation, evaluated through two case studies demonstrating analysis, optimization, and synthesis.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 provides an extensive state-of-the-art review covering LLMs, agentic AI, chemical engineering applications, and explicitly positions this work relative to Rajeev et al. [45] (same APS platform) and other LLM-process simulation integrations.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "The MCP server toolset is described in detail (Tables 1 and A.1) but source code is not released; no repository link or code availability statement is provided.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "The case studies use a proprietary APS example file from AVEVA's commercial library; full prompts/responses are in Appendix B but no structured data artifact is released independently.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "APS version 2025 and Claude Sonnet 4.0 are mentioned, but no requirements file, Dockerfile, or dependency specification for the FastMCP-based MCP server is provided.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "Appendix B provides full prompts, but reproduction requires an AVEVA APS 2025 license, Claude Desktop with MCP, and the unreleased MCP server code; no step-by-step reproduction instructions are provided.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "No confidence intervals or error bars are reported anywhere; evaluation is qualitative (Table 2 categories) and simulation results are single-run point values.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "The paper makes comparative claims about step-by-step vs. single-prompt modes but relies entirely on qualitative description; no statistical tests are applied.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "The optimization case (Answer 1.2) provides a before/after comparison table with percentage changes for six metrics (e.g., +10.9% methanol purity, +6.1% energy duty) relative to a baseline configuration.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "Only two case studies on a single separation system are presented with no justification for why this is sufficient to support conclusions about LLM agent capabilities in process simulation.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "All results are from single runs; no variance, standard deviation, or repeated trial results are reported for any metric.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "No baseline comparison to manual expert construction, other LLMs, or other AI-assisted approaches is included; within-task comparisons (original vs. optimized settings) are not system-level baselines.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": false,
    185           "answer": false,
    186           "justification": "No baselines are included in the evaluation.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Case Study 2 compares two interaction modes (step-by-step dialogue vs. single-prompt autonomous) for the same flowsheet synthesis task, effectively ablating the level of user guidance.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Evaluation uses correctness, completeness, efficiency, and user satisfaction qualitatively; optimization provides quantitative metrics (purity, flow rate, energy duty); suggestions rated across 5 quality levels in Table 2.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "The domain-expert authors perform a qualitative evaluation of the agent's 11 process improvement suggestions using a 5-category rating scale in Table 2, with detailed justifications per suggestion.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": false,
    209           "answer": false,
    210           "justification": "Not applicable; this is a capability demonstration, not a prediction task with train/test splits.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Table 2 categorizes all 11 suggestions by quality, and Section 5 discusses them by category (Process Configuration, Operating Parameter, Advanced Configurations, Equipment Modifications).",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Numerous failure cases are documented in detail: arithmetic errors, incorrect tray efficiency claim, four unspecified APS variables set in single-prompt mode, unverified economic estimates, and incorrect feed stage optimality inference.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "The paper explicitly reports negative results including the 'Potentially misleading' suggestion category, multiple technical errors in single-prompt mode, and the fundamental limitation that step-by-step guidance still requires domain expertise.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "The paper specifies 'Claude Sonnet 4.0 (Anthropic)' and 'APS (version 2025)' as the specific versions used, with reference [47] pointing to the Claude 4 announcement.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "All prompts are provided verbatim in Sections 4-5 and in full in Appendix B, including complete agent responses with tool call inputs and outputs.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "No LLM hyperparameters (temperature, top-p, max tokens, etc.) are reported; Claude Desktop default settings appear to have been used without documentation.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "The MCP architecture is thoroughly described in Section 3 (Figure 1, Tables 1 and A.1) including the toolset interface design, FastMCP server implementation, and APS Python API integration.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": false,
    259           "answer": false,
    260           "justification": "No data preprocessing is performed; inputs are natural language prompts and proprietary simulation files requiring no preprocessing pipeline.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "Appendix B contains complete prompt-response transcripts including all tool call inputs and outputs, providing the raw interaction data for both case studies.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Sections 4.2 and 4.3 describe the case study design specifying what tasks were given, what interaction modes were tested, and how evaluation was structured.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No participants were recruited; the case studies were conducted by the authors themselves using AVEVA's example library.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The full pipeline from user natural language → LLM agent → MCP server → APS Python API → simulation results is documented in Section 3 with Figure 1 and Table 1.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": false,
    293           "answer": false,
    294           "justification": "Not applicable; the paper uses the LLM as an agent for real-time tool use with a proprietary simulator, not to evaluate knowledge recall on benchmarks.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": false,
    299           "answer": false,
    300           "justification": "Not applicable; the test system (APS example file) is a proprietary commercial file, and evaluation is based on real-time tool use, not knowledge retrieval.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": false,
    305           "answer": false,
    306           "justification": "Not applicable; no publicly available benchmarks are used.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants; case studies were conducted by the authors.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "No API costs or latency are reported despite multi-step agentic workflows with 12-23 tool calls per task using the commercial Claude API.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
     364           "justification": "Total API usage and compute budget are not stated; tool-call counts are reported (e.g., 18 and 23 for the two Case Study 2 variants) but not their associated cost.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "The LLM agent can autonomously analyze APS flowsheets and extract relevant information from thousands of variables, presenting a correct structured summary.",
    373       "evidence": "Agent accessed 356 of 2006 variables and presented a 24-variable summary with correct numerical values using 7 tool calls in Case Study 1; the paper notes 'all numerical values are reported as invariably correct'.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "The agent generates generally sound process improvement suggestions, with no fundamentally wrong recommendations.",
    378       "evidence": "Table 2 rates 11 suggestions: 3 Very Good, 4 Good but missing details, 3 Good but not relevant, 1 Potentially misleading, 0 Wrong; evaluation conducted by the authors as domain experts.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "The agent successfully optimizes reflux ratio to achieve >95 mol% methanol purity through iterative parameter adjustment without user guidance.",
    383       "evidence": "Answer 1.2 documents 12 tool calls testing 4 reflux values (1.5, 1.3, 1.4, 1.45) with APS-confirmed simulation results reaching 95.1 mol% purity.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Step-by-step dialogue mode produces reliable flowsheet construction with no problematic tool calls across 9 interaction steps.",
    388       "evidence": "Case Study 2 Variant 1 reports 18 tool calls setting 13 variables/parameters correctly, with the paper stating 'no problematic or unnecessary tool calls throughout the construction sequence'.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Single-prompt autonomous mode can construct a functional water-methanol separation flowsheet with minimal user guidance.",
    393       "evidence": "Case Study 2 Variant 2 achieves a converged simulation with 23 tool calls across 3 interaction steps, but with 4 attempts to set unspecified variables and minor redundancies.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "LLM-based agents require expert oversight due to arithmetic errors, hallucinated economic figures, and inability to distinguish applicable from inapplicable engineering concepts.",
    398       "evidence": "Specific documented failures include 10.9% reported vs. correct 12.9% calculation error, unverified '10-15% price premium' and '20-40% energy savings' claims, and the reactive distillation suggestion deemed irrelevant to the problem.",
    399       "supported": "strong"
    400     }
    401   ],
    402   "methodology_tags": [
    403     "case-study"
    404   ],
    405   "key_findings": "An LLM agent (Claude Sonnet 4.0) integrated with AVEVA Process Simulation via Model Context Protocol successfully performs flowsheet analysis, iterative optimization, and autonomous flowsheet synthesis for a simple water-methanol binary separation. Step-by-step interaction mode achieves reliable construction with no errors across 18 tool calls, while single-prompt autonomous mode constructs functional flowsheets in 23 calls but makes technical mistakes including setting unspecified variables and providing unverified economic claims. The agent's improvement suggestions are generally sound (3/11 'Very Good', none 'Wrong') but show a tendency to overgenerate speculative ideas and to state unverified quantitative estimates as fact, requiring domain-expert oversight throughout.",
    406   "red_flags": [
    407     {
    408       "flag": "Single trivial test system",
    409       "detail": "All evaluation uses only a water-methanol binary distillation system, explicitly chosen for its simplicity; this is the simplest possible separation case and results cannot reliably generalize to complex industrial flowsheets with multiple units, recycles, or reactions."
    410     },
    411     {
    412       "flag": "No independent evaluation",
    413       "detail": "The authors evaluate their own system using their own domain expertise with no independent evaluators, blind assessment, or inter-rater reliability measurement for the qualitative Table 2 ratings."
    414     },
    415     {
    416       "flag": "No comparison baselines",
    417       "detail": "There is no comparison to manual expert performance, other LLMs, or other AI approaches; it is impossible to determine whether the framework offers any advantage over simply asking Claude questions directly or consulting an expert."
    418     },
    419     {
    420       "flag": "Unverified quantitative claims presented as facts",
    421       "detail": "Economic impact figures ('10-15% price premium', '15-25% energy savings', '~40% energy reduction') are stated in the agent output without verification; the authors acknowledge these originate from LLM training data but the paper still presents them without adequate disclaimer."
    422     },
    423     {
    424       "flag": "Core contribution (MCP server code) not released",
    425       "detail": "The MCP server implementation, which is the primary technical artifact enabling the framework, is not released, making the work unreproducible without both an AVEVA APS license and the unreleased code."
    426     },
    427     {
    428       "flag": "Self-reported qualitative evaluation only",
    429       "detail": "System performance is assessed through author-opinion qualitative categories with no quantitative success rate, no user study, and no comparison to ground truth, making it impossible to objectively assess framework utility."
    430     }
    431   ],
    432   "cited_papers": [
    433     {
    434       "title": "Multi-agent systems for chemical engineering: a review and perspective",
    435       "relevance": "Directly reviews MAS and LLM agent applications in chemical engineering, providing systematic context for this work's contribution."
    436     },
    437     {
    438       "title": "Autonomous chemical research with large language models (Coscientist)",
    439       "relevance": "Seminal work on autonomous LLM agents for chemistry lab automation; direct methodological predecessor to LLM-tool integration in scientific/engineering domains."
    440     },
    441     {
    442       "title": "Augmenting large language models with chemistry tools (ChemCrow)",
    443       "relevance": "Demonstrates LLM-tool integration for chemical tasks using a curated toolset, a direct methodological predecessor."
    444     },
    445     {
    446       "title": "LLM-guided Chemical Process Optimization with a Multi-Agent Approach",
    447       "relevance": "Most closely related work on LLM-guided process optimization with multi-agent coordination; direct comparison context for this single-agent approach."
    448     },
    449     {
    450       "title": "Application of Artificial Intelligence in process simulation tool (Rajeev et al.)",
    451       "relevance": "Direct predecessor integrating AI with AVEVA Process Simulation (same platform), providing the most relevant prior work baseline."
    452     },
    453     {
    454       "title": "GPT Prompt Engineering for a Large Language Model-Based Process Improvement Generation System",
    455       "relevance": "Uses GPT for process design improvements from flowsheet descriptions, directly related to Case Study 1's improvement suggestion task."
    456     },
    457     {
    458       "title": "ReAct: Synergizing reasoning and acting in language models",
    459       "relevance": "Foundational framework for LLM agents combining reasoning and tool use, the underlying paradigm for this work's agent architecture."
    460     }
    461   ],
    462   "engagement_factors": {
    463     "practical_relevance": {
    464       "score": 3,
    465       "justification": "Demonstrates a working integration with commercial process simulation software via the emerging MCP standard, directly applicable to practicing chemical engineers using AVEVA APS."
    466     },
    467     "surprise_contrarian": {
    468       "score": 1,
    469       "justification": "Using MCP as a standardized protocol for LLM-tool integration is relatively novel in chemical engineering, but LLM-assisted simulation is an expected direction with no findings that contradict prevailing assumptions."
    470     },
    471     "fear_safety": {
    472       "score": 0,
    473       "justification": "The paper explicitly frames LLM agents as collaborative tools requiring expert oversight, not autonomous replacements, and raises no safety risk concerns."
    474     },
    475     "drama_conflict": {
    476       "score": 0,
    477       "justification": "Straightforward capability demonstration with no controversy, competing claims, or industry conflict."
    478     },
    479     "demo_ability": {
    480       "score": 2,
    481       "justification": "The MCP integration runs on Claude Desktop (publicly accessible) but requires an AVEVA APS license (expensive proprietary software) and unreleased server code, limiting reproducibility."
    482     },
    483     "brand_recognition": {
    484       "score": 2,
    485       "justification": "Uses Anthropic's Claude (prominent AI brand) and AVEVA Process Simulation (industry-standard commercial tool in chemical engineering); DTU is a respected technical university."
    486     }
    487   },
    488   "hn_data": {
    489     "threads": [],
    490     "top_points": 0,
    491     "total_points": 0,
    492     "total_comments": 0
    493   }
    494 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs