scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (35005B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "A fine-tuned large language model based molecular dynamics agent for code generation to obtain material thermodynamic parameters",
      6     "authors": [
      7       "Zhuo-Fan Shi",
      8       "Chunxiao Xin",
      9       "Tong Huo",
     10       "Yun-Tao Jiang",
     11       "Bowen Wu"
     12     ],
     13     "year": 2025,
     14     "venue": "Scientific Reports",
     15     "arxiv_id": null,
     16     "doi": "10.1038/s41598-025-92337-6"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims a 42.22% time reduction (supported by Fig. 4a), improved code generation (supported by Fig. 4c), and improved review capabilities (supported by Fig. 4d-f). The claims are generally supported by the presented results.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper claims MDAgent 'significantly improves' capabilities and 'reduces' time — these are causal claims. The study compares with/without MDAgent but with no randomization, no control for confounds (e.g., task familiarity, order effects), and an unspecified number of experts.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper claims MDAgent as 'a general paradigm that can be extended to text-to-code applications in the field of materials science' and states it 'possesses inherent scalability.' However, only 4 thermodynamic tasks on 2 simple materials (copper, diamond) using LAMMPS were tested — far too narrow to support these general claims.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No alternative explanations are discussed. The time savings could be due to novelty effects, task ordering, or expert familiarity. The code quality improvements from fine-tuning could be due to task-specific memorization. None of these are considered.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper measures expert satisfaction scores and task completion time, then frames these as evidence that MDAgent 'effectively assist[s] experts' and has 'potential applications in the field of materials science.' The gap between limited proxy measurements and broad effectiveness claims is not acknowledged.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations section. The Discussion mentions that MDAgent is 'a semi-automated intelligent assistant' due to LLM limitations, but this is a single paragraph embedded in the discussion, not a substantive limitations section.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats to validity are discussed. The paper does not address potential confounds such as small expert sample, order effects, task selection bias, or evaluator subjectivity.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what the results do NOT show. While it mentions LAMMPS as 'a case study,' it simultaneously claims broad scalability without stating specific boundaries or untested scenarios.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgements state: 'We gratefully acknowledge the support of the project \"Key Technologies Research for Data Space Application Construction Oriented towards Computational Reproduction\"' and 'supported by National Key Laboratory of Data Space Technology and System.'",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All author affiliations are listed: Peking University School of Software and Microelectronics, National Key Laboratory of Data Space Technology and System, Advanced Institute of Big Data, and Institute of Information Engineering (CAS).",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Funding is from the National Key Laboratory of Data Space Technology and System, a government-affiliated research lab. They have no apparent financial stake in the outcome of LAMMPS code generation results.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper states: 'The authors declare no competing interests.'",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are defined: 'Molecular Dynamics Agent' (framework for automatic code generation/refinement), 'LAMMPS' (MD simulation software), 'semi-automated intelligent assistant'. Technical terms like 'QLoRA' are used without definition, presumably as assumed domain knowledge.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three contributions explicitly stated: (1) MDAgent system for automating LAMMPS code generation, (2) LSCF-Dataset for fine-tuning, (3) LEQS-Dataset for evaluation. Novelty positioned as text-to-code (not text-to-text) with domain-specific fine-tuning.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Prior work reviewed (ChemLLM, MatterGen, ChemCrow, HoneyComb, ChatMOF). Paper positions itself as addressing a gap: prior work focuses on text-to-text, this work targets text-to-code for LAMMPS with dedicated datasets.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "Code and datasets released on GitHub (https://github.com/FredericVAN/PKU_MDAgent) and available upon request; however, environment specifications (Python version, dependency versions) and step-by-step reproduction instructions are absent from the paper.",
    124         "source": "haiku",
    125         "code_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "A GitHub repository URL is provided: https://github.com/FredericVAN/PKU_MDAgent. The paper states 'The data is also publicly available at [this URL].'",
    129           "source": "opus"
    130         },
    131         "data_released": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "The paper states 'The code and datasets supporting the findings of this study are available' at the GitHub URL. Both LSCF-Dataset and LEQS-Dataset are referenced as publicly available.",
    135           "source": "opus"
    136         },
    137         "environment_specified": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No requirements.txt, Dockerfile, conda environment, or detailed library versions are provided in the paper. Docker is mentioned as part of the system architecture but no reproducible environment specification is given.",
    141           "source": "opus"
    142         },
    143         "reproduction_instructions": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "The paper says 'Detailed description of the code implementation details please see the GitHub repository' but provides no step-by-step reproduction instructions in the paper itself. No README commands, scripts, or reproducing-results section.",
    147           "source": "opus"
    148         }
    149       },
    150       "statistical_methodology": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "Error bars shown in Figure 4a,b; one effect size quantified (42.22% time reduction). Missing: no statistical significance tests (no p-values, t-tests), no sample size justification or power analysis, no confidence intervals on comparative claims.",
    154         "source": "haiku",
    155         "confidence_intervals_or_error_bars": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "No confidence intervals or error bars are reported for the main results. Fig. 4 shows bar charts without uncertainty measures. Expert scores and time comparisons are presented as point estimates only.",
    159           "source": "opus"
    160         },
    161         "significance_tests": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "The paper uses language like 'significantly improves' and 'significantly lower' but reports no statistical significance tests (no p-values, t-tests, etc.). Comparative claims are made by comparing raw numbers only.",
    165           "source": "opus"
    166         },
    167         "effect_sizes_reported": {
    168           "applies": true,
    169           "answer": true,
    170           "justification": "The paper reports the 42.22% reduction in average task time with baseline context. Thermodynamic calculations give absolute values from both methods alongside theoretical values (e.g., 3.37 vs 3.56 J/(cm³·K) for heat capacity, against a theoretical value of ~3.45 J/(cm³·K)).",
    171           "source": "opus"
    172         },
    173         "sample_size_justified": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "The number of expert evaluators is never specified — only 'multiple experts' and 'several experts' are mentioned. The LSCF-Dataset has 167 scripts. No sample size justification or power analysis is provided for any evaluation.",
    177           "source": "opus"
    178         },
    179         "variance_reported": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "No standard deviations, variance, or spread measures are reported for any experimental results. It is unclear whether experiments were run multiple times.",
    183           "source": "opus"
    184         }
    185       },
    186       "evaluation_design": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "User studies with expert comparison, multiple metrics (time, satisfaction, generation scores, evaluation accuracy, MAE/MSE), human evaluation of scripts, and held-out test sets included. Gaps: no ablation studies isolating MDAgent components, baseline models not named, failure cases minimally discussed.",
    190         "source": "haiku",
    191         "baselines_included": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "The paper compares MDAgent-assisted workflow vs. traditional manual methods (Fig. 4a,b). For code generation, fine-tuned LAMMPSLLM is compared against 'other large models of similar size that had not been fine-tuned' (Fig. 4c).",
    195           "source": "opus"
    196         },
    197         "baselines_contemporary": {
    198           "applies": true,
    199           "answer": false,
    200           "justification": "The specific baseline models compared in Fig. 4c are not clearly named in the paper text. Without knowing which models were compared, it is impossible to assess whether they are contemporary and competitive.",
    201           "source": "opus"
    202         },
    203         "ablation_study": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "MDAgent has multiple components (Manager, Planner, Worker, Evaluator, RAG, Tools). While Worker and Evaluator are evaluated separately, there is no systematic ablation removing components to measure their individual contributions.",
    207           "source": "opus"
    208         },
    209         "multiple_metrics": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Multiple evaluation metrics are used: task completion time (Fig. 4a), expert satisfaction scores (Fig. 4b), code generation quality scores (Fig. 4c), evaluator accuracy (Fig. 4d), MAE (Fig. 4e), and MSE (Fig. 4f).",
    213           "source": "opus"
    214         },
    215         "human_evaluation": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Human experts evaluated both the generated scripts (scoring 1-10) and the overall MDAgent workflow (satisfaction scores). Experts rated task assistance and code quality across multiple tasks.",
    219           "source": "opus"
    220         },
    221         "held_out_test_set": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "For the LEQS-Dataset: 'A random subset of the LEQS-Dataset will be used for fine-tuning... with a separate random subset designated for testing to ensure no overlap between the two.'",
    225           "source": "opus"
    226         },
    227         "per_category_breakdown": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Results are broken down across four thermodynamic tasks (heat capacity, lattice constant, melting point, thermal expansion) and separately for Worker evaluation (Fig. 4c), Evaluator evaluation (Fig. 4d-f), and overall assistance (Fig. 4a-b).",
    231           "source": "opus"
    232         },
    233         "failure_cases_discussed": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "No specific failure cases are shown or analyzed. The paper acknowledges that MDAgent is 'semi-automated' but does not discuss where or how the system fails on specific tasks.",
    237           "source": "opus"
    238         },
    239         "negative_results_reported": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "Every experiment shows positive results. No configurations that failed, approaches that were abandoned, or conditions where MDAgent performed worse than baselines are reported.",
    243           "source": "opus"
    244         }
    245       },
    246       "setup_transparency": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "Agent architecture and data preprocessing well-documented. Missing: base model versions/snapshot dates, actual prompts used, fine-tuning hyperparameters (learning rate, batch size, epochs), which models used for Worker/Evaluator.",
    250         "source": "haiku",
    251         "model_versions_specified": {
    252           "applies": true,
    253           "answer": false,
    254           "justification": "The paper references 'Qwen' and 'ChatGLM' without exact version numbers or snapshot dates. The base model used for fine-tuning is described only as 'open-source large models.' No specific model version (e.g., Qwen-7B-v1.0) is stated.",
    255           "source": "opus"
    256         },
    257         "prompts_provided": {
    258           "applies": true,
    259           "answer": false,
    260           "justification": "No actual prompt text is provided. The paper describes the roles of Manager, Worker, and Evaluator in natural language but does not include the actual prompts or system instructions used.",
    261           "source": "opus"
    262         },
    263         "hyperparameters_reported": {
    264           "applies": true,
    265           "answer": false,
    266           "justification": "QLoRA fine-tuning with Unsloth framework is mentioned but no specific hyperparameters (learning rate, temperature, top-p, LoRA rank, epochs, batch size) are reported.",
    267           "source": "opus"
    268         },
    269         "scaffolding_described": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "The agent architecture is described in detail: Manager receives inputs and coordinates tasks, Planner decomposes objectives, Worker generates scripts, Evaluator provides feedback via a Reflexion-inspired loop with score thresholds. RAG and Tool modules are also described. Fig. 2a provides an architecture diagram.",
    273           "source": "opus"
    274         },
    275         "data_preprocessing_documented": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "For LSCF-Dataset: data gathered from official docs, papers, and open-source projects at 1:2:2 ratio, screened for correctness, annotated with three-part structure, converted to Alpaca format. For LEQS-Dataset: tasks designed by senior scientists, scripts generated by LLM, then evaluated by experts using structured rubric.",
    279           "source": "opus"
    280         }
    281       },
    282       "data_integrity": {
    283         "applies": true,
    284         "answer": true,
    285         "justification": "Datasets publicly available on GitHub. LSCF/LEQS construction procedures detailed from source collection through annotation and format conversion. Expert evaluation pipeline documented. Gap: expert recruitment/selection criteria for evaluation not explicitly stated.",
    286         "source": "haiku",
    287         "raw_data_available": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "The paper states 'The code and datasets supporting the findings of this study are... publicly available at https://github.com/FredericVAN/PKU_MDAgent.' Both datasets (LSCF and LEQS) are referenced as available.",
    291           "source": "opus"
    292         },
    293         "data_collection_described": {
    294           "applies": true,
    295           "answer": true,
    296           "justification": "LSCF-Dataset: 167 scripts from official docs, papers, and open-source at 1:2:2 ratio, with 127 simulation and 40 modeling scripts. LEQS-Dataset: tasks designed by senior scientists covering thermal expansion, conductivity, density, and phase transitions.",
    297           "source": "opus"
    298         },
    299         "recruitment_methods_described": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "The paper mentions 'several experts' and 'multiple experts in materials science' but provides no details on how many experts participated, how they were recruited, their qualifications, or whether recruitment could introduce bias.",
    303           "source": "opus"
    304         },
    305         "data_pipeline_documented": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "The data pipeline is documented for both datasets: LSCF goes through collection → screening/validation → annotation (3-part structure) → Alpaca format conversion. LEQS goes through task design → LLM generation → expert scoring with rubric → train/test split.",
    309           "source": "opus"
    310         }
    311       },
    312       "contamination": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Base model training cutoff date not stated. No discussion of whether LAMMPS concepts/examples could have appeared in pre-training data. Within-dataset test/train split is mentioned, but pre-training contamination not addressed.",
    316         "source": "haiku",
    317         "training_cutoff_stated": {
    318           "applies": true,
    319           "answer": false,
    320           "justification": "The base LLM's training data cutoff is never stated. The models fine-tuned (described only as 'open-source large models') could have pre-training data that includes LAMMPS scripts or solutions to the test tasks.",
    321           "source": "opus"
    322         },
    323         "train_test_overlap_discussed": {
    324           "applies": true,
    325           "answer": false,
    326           "justification": "The paper separates fine-tuning and test subsets of LEQS-Dataset, but does not discuss whether the base model's pre-training data might contain LAMMPS scripts, documentation, or solutions similar to the test tasks.",
    327           "source": "opus"
    328         },
    329         "benchmark_contamination_addressed": {
    330           "applies": true,
    331           "answer": false,
    332           "justification": "No discussion of whether the LAMMPS scripts, documentation, or similar thermodynamic problems used in evaluation were in the base model's pre-training data. Given that LAMMPS documentation is publicly available online, contamination risk is non-trivial.",
    333           "source": "opus"
    334         }
    335       },
    336       "human_studies": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "Involves human expert evaluation but lacks formal research protocols: no pre-registration, no IRB/ethics approval, no participant demographics (number, experience level), no blinding (experts knew they were testing MDAgent), and no attrition reporting.",
    340         "source": "haiku",
    341         "pre_registered": {
    342           "applies": true,
    343           "answer": false,
    344           "justification": "No pre-registration is mentioned. The expert study evaluating MDAgent was not pre-registered on any platform.",
    345           "source": "opus"
    346         },
    347         "irb_or_ethics_approval": {
    348           "applies": true,
    349           "answer": false,
    350           "justification": "No IRB or ethics board approval is mentioned despite the study involving human expert participants completing tasks and providing evaluations.",
    351           "source": "opus"
    352         },
    353         "demographics_reported": {
    354           "applies": true,
    355           "answer": false,
    356           "justification": "Participants are described only as 'materials science experts' and 'senior materials scientists.' No demographics (number, experience level, affiliation, gender) are reported.",
    357           "source": "opus"
    358         },
    359         "inclusion_exclusion_criteria": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "No inclusion or exclusion criteria are stated for expert participants. The paper simply references 'several experts' and 'multiple experts' without describing how they were selected.",
    363           "source": "opus"
    364         },
    365         "randomization_described": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "No randomization procedure is described for the comparison of MDAgent-assisted vs. manual task completion. It is unclear whether task order or assignment was randomized.",
    369           "source": "opus"
    370         },
    371         "blinding_described": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "No blinding is described. For the assistant evaluation, experts obviously knew which method they were using. For code evaluation (Fig. 4c), it is not stated whether evaluators were blind to which model generated each script.",
    375           "source": "opus"
    376         },
    377         "attrition_reported": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "No attrition or dropout information is reported. The initial and final number of expert participants is not stated.",
    381           "source": "opus"
    382         }
    383       },
    384       "cost_and_practicality": {
    385         "applies": true,
    386         "answer": false,
    387         "justification": "No inference cost (latency) or computational budget reported. Paper mentions 'resource intensive' interaction with Docker but no quantified cost or computational requirements (GPU hours, API calls, memory).",
    388         "source": "haiku",
    389         "inference_cost_reported": {
    390           "applies": true,
    391           "answer": false,
    392           "justification": "No inference cost, tokens consumed, or per-task compute time is reported. The 42.22% time reduction refers to human task time, not computational cost.",
    393           "source": "opus"
    394         },
    395         "compute_budget_stated": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "No GPU hours, training time, total compute budget, or hardware specifications are reported for fine-tuning or inference.",
    399           "source": "opus"
    400         }
    401       },
    402       "experimental_rigor": {
    403         "seed_sensitivity_reported": {
    404           "applies": true,
    405           "answer": false,
    406           "justification": "No mention of multiple random seeds or sensitivity analysis. It is unclear whether fine-tuning or generation experiments were repeated across seeds.",
    407           "source": "opus"
    408         },
    409         "number_of_runs_stated": {
    410           "applies": true,
    411           "answer": false,
    412           "justification": "The number of experimental runs is never stated. Results are presented without indicating how many times each experiment was performed.",
    413           "source": "opus"
    414         },
    415         "hyperparameter_search_budget": {
    416           "applies": true,
    417           "answer": false,
    418           "justification": "No hyperparameter search budget is reported. QLoRA fine-tuning is mentioned but the number of configurations tried, search method, or compute spent on tuning is not discussed.",
    419           "source": "opus"
    420         },
    421         "best_config_selection_justified": {
    422           "applies": true,
    423           "answer": false,
    424           "justification": "No description of how the final model configuration was selected. Only the best results are presented with no explanation of the selection process.",
    425           "source": "opus"
    426         },
    427         "multiple_comparison_correction": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "Multiple comparisons are made across models, tasks, and metrics, but no statistical tests are performed at all, let alone corrections for multiple comparisons.",
    431           "source": "opus"
    432         },
    433         "self_comparison_bias_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "The authors evaluate their own MDAgent system and fine-tuned models. No acknowledgment of self-evaluation bias, and no independent evaluation is included.",
    437           "source": "opus"
    438         },
    439         "compute_budget_vs_performance": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "Performance is not reported as a function of compute budget. The fine-tuned model may use different compute than baselines, but this is not discussed or controlled for.",
    443           "source": "opus"
    444         },
    445         "benchmark_construct_validity": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "The LEQS-Dataset evaluation uses expert scores (1-10) based on a custom rubric. No discussion of whether this scoring rubric has been validated or whether it actually measures the capabilities claimed.",
    449           "source": "opus"
    450         },
    451         "scaffold_confound_addressed": {
    452           "applies": true,
    453           "answer": false,
    454           "justification": "When comparing fine-tuned vs. base models for code generation (Fig. 4c), it is not explicitly stated whether the same scaffold/prompting setup was used. The scaffold confound is not discussed.",
    455           "source": "opus"
    456         }
    457       },
    458       "data_leakage": {
    459         "temporal_leakage_addressed": {
    460           "applies": true,
    461           "answer": false,
    462           "justification": "No discussion of temporal leakage. The base models may have been trained on LAMMPS documentation and scripts that are similar to the evaluation tasks.",
    463           "source": "opus"
    464         },
    465         "feature_leakage_addressed": {
    466           "applies": true,
    467           "answer": false,
    468           "justification": "No discussion of whether the evaluation setup leaks information. The LEQS task descriptions may contain hints that artificially improve generation quality.",
    469           "source": "opus"
    470         },
    471         "non_independence_addressed": {
    472           "applies": true,
    473           "answer": false,
    474           "justification": "No discussion of whether fine-tuning and test data share structural similarities (e.g., similar LAMMPS patterns, same material types, similar script structures from the same domain).",
    475           "source": "opus"
    476         },
    477         "leakage_detection_method": {
    478           "applies": true,
    479           "answer": false,
    480           "justification": "No leakage detection or prevention method is applied. No overlap analysis, canary strings, or decontamination is mentioned beyond the basic train/test split.",
    481           "source": "opus"
    482         }
    483       }
    484     }
    485   },
    486   "claims": [
    487     {
    488       "claim": "MDAgent reduces task completion time by 42.22% compared to traditional manual methods",
    489       "evidence": "Figure 4a shows task elapsed time comparison in user study with expert participants",
    490       "supported": "strong"
    491     },
    492     {
    493       "claim": "Fine-tuning with domain-specific datasets improves LAMMPS code generation quality",
    494       "evidence": "Figure 4c shows fine-tuned LAMMPSLLM outperforms non-fine-tuned models on script generation; Figure 4e,f show fine-tuned LammpsEvaluator has lower MAE/MSE than the non-fine-tuned model",
    495       "supported": "strong"
    496     },
    497     {
    498       "claim": "Human experts maintain high satisfaction despite reduced effort using MDAgent",
    499       "evidence": "Figure 4b shows expert satisfaction ratings remain high despite time reduction (Fig 4a)",
    500       "supported": "moderate"
    501     },
    502     {
    503       "claim": "MDAgent enables novice materials scientists to complete complex simulation tasks",
    504       "evidence": "System designed to lower 'knowledge barrier' and help 'entry-level materials science practitioners', but testing only involved senior experts",
    505       "supported": "weak"
    506     },
    507     {
    508       "claim": "The LEQS-Dataset evaluation rubric validly assesses LAMMPS script quality",
    509       "evidence": "Experts discussed and developed evaluation criteria, but no external validation or inter-rater reliability statistics provided",
    510       "supported": "moderate"
    511     },
    512     {
    513       "claim": "LammpsEvaluator can accurately identify and score errors in LAMMPS scripts",
    514       "evidence": "Figure 4d-f show evaluation performance; paper acknowledges model 'is not yet ideal' with 'limitations in parameter size and data volume'",
    515       "supported": "weak"
    516     },
    517     {
    518       "claim": "MDAgent's approach is generalizable to other materials science software (VASP, etc.)",
    519       "evidence": "Authors cite 'established scaling law theories' and mention plans to extend to DFT, but no cross-software evaluation demonstrated",
    520       "supported": "weak"
    521     }
    522   ],
    523   "methodology_tags": [
    524     "case-study",
    525     "benchmark-eval"
    526   ],
    527   "key_findings": "MDAgent, an LLM-based multi-component agent system, reduces task completion time by 42.22% when generating LAMMPS simulation scripts compared to manual expert work, while maintaining expert satisfaction. Fine-tuning with custom datasets (LSCF for code generation, LEQS for evaluation) substantially improves both code quality and error detection accuracy relative to general-purpose models. However, the system remains semi-automated (requires human oversight), and the LammpsEvaluator component still exhibits non-ideal performance with meaningful error rates.",
    528   "red_flags": [
    529     {
    530       "flag": "No statistical significance testing",
    531       "detail": "Comparative claims lack p-values, confidence intervals, or significance tests; results presented descriptively only"
    532     },
    533     {
    534       "flag": "Author self-evaluation bias",
    535       "detail": "The system's developers, funded by the same institution, evaluate their own work using expert panels they helped design"
    536     },
    537     {
    538       "flag": "Limited generalization scope",
    539       "detail": "Tested only on 4 thermodynamic tasks; no cross-task evaluation or evaluation on unseen problem types"
    540     },
    541     {
    542       "flag": "Expert sample not characterized",
    543       "detail": "Number of expert evaluators and their selection criteria not specified; unclear if results would replicate with different experts"
    544     },
    545     {
    546       "flag": "Missing reproduction details",
    547       "detail": "Base model versions, prompts, hyperparameters (learning rate, batch size, epochs), and dependency versions not provided"
    548     },
    549     {
    550       "flag": "Evaluator underperformance unquantified",
    551       "detail": "MAE/MSE values shown in figures but not compared to human expert baseline; 'not yet ideal' is vague"
    552     },
    553     {
    554       "flag": "No ablation studies",
    555       "detail": "Cannot isolate which components (fine-tuning, agent architecture, dataset choice) drive time/quality improvements"
    556     },
    557     {
    558       "flag": "Semi-automation limits claims",
    559       "detail": "System requires human-in-the-loop, contradicting claims about automating 'labor-intensive challenges'"
    560     },
    561     {
    562       "flag": "No training cutoff stated",
    563       "detail": "Base model training date not disclosed; cannot rule out pre-training contamination with LAMMPS examples"
    564     }
    565   ],
    566   "cited_papers": [
    567     {
    568       "title": "ChemCrow: Augmenting Large Language Models with Chemistry Tools",
    569       "relevance": "Prior agent system leveraging API tools for domain-specific tasks; inspired MDAgent's modular architecture"
    570     },
    571     {
    572       "title": "HoneyComb: A Flexible LLM-based Agent System for Materials Science",
    573       "relevance": "Prior LLM agent for materials science; MDAgent differentiates with fine-tuning and dedicated datasets"
    574     },
    575     {
    576       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    577       "relevance": "Self-correction mechanism adopted for MDAgent's Worker-Evaluator feedback loop"
    578     },
    579     {
    580       "title": "ChemLLM: A Chemical Large Language Model",
    581       "relevance": "Fine-tuned LLM for chemistry domain; comparable approach to MDAgent's domain-specific fine-tuning"
    582     },
    583     {
    584       "title": "MatterGen: A Generative Model for Inorganic Materials Design",
    585       "relevance": "Generative approach to materials science; contrasts with MDAgent's text-to-code focus"
    586     },
    587     {
    588       "title": "AutoGen: Enabling Next-gen LLM Applications Via Multi-agent Conversation",
    589       "relevance": "Multi-agent orchestration framework; conceptually related to MDAgent's Manager-Worker-Evaluator design"
    590     },
    591     {
    592       "title": "TaskWeaver: A Code-First Agent Framework",
    593       "relevance": "Code-first agent system; related approach to MDAgent's code generation focus"
    594     }
    595   ],
    596   "engagement_factors": {
    597     "practical_relevance": {
    598       "score": 1,
    599       "justification": "Useful only for materials scientists working with LAMMPS thermodynamic simulations; very narrow domain applicability."
    600     },
    601     "surprise_contrarian": {
    602       "score": 0,
    603       "justification": "Confirms the expected finding that fine-tuned LLMs can help with domain-specific code generation."
    604     },
    605     "fear_safety": {
    606       "score": 0,
    607       "justification": "No safety or security concerns raised; the domain is scientific simulation."
    608     },
    609     "drama_conflict": {
    610       "score": 0,
    611       "justification": "No controversy, no challenges to existing claims or institutions."
    612     },
    613     "demo_ability": {
    614       "score": 1,
    615       "justification": "GitHub repository exists but requires LAMMPS domain expertise and specific setup to try."
    616     },
    617     "brand_recognition": {
    618       "score": 0,
    619       "justification": "From Peking University, not a widely recognized AI lab; published in Scientific Reports, a broad-scope journal."
    620     }
    621   },
    622   "hn_data": {
    623     "threads": [],
    624     "top_points": 0,
    625     "total_points": 0,
    626     "total_comments": 0
    627   }
    628 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs