scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30416B)
      1 {
      2   "paper": {
      3     "title": "A fine-tuned large language model based molecular dynamics agent for code generation to obtain material thermodynamic parameters",
      4     "authors": [
      5       "Zhuofan Shi",
      6       "Chunxiao Xin",
      7       "Tong Huo",
      8       "Yuntao Jiang",
      9       "Bowen Wu",
     10       "Xingyue Chen",
     11       "Wei Qin",
     12       "Xinjian Ma",
     13       "Gang Huang",
     14       "Zhenyu Wang",
     15       "Xiang Jing"
     16     ],
     17     "year": 2025,
     18     "venue": "Scientific Reports",
     19     "doi": "10.1038/s41598-025-92337-6"
     20   },
     21   "scan_version": 3,
     22   "active_modules": ["experimental_rigor", "data_leakage"],
     23   "methodology_tags": ["benchmark-eval", "case-study"],
     24   "key_findings": "MDAgent, an LLM-based agent framework for automating LAMMPS molecular dynamics simulation code, reduced average task completion time by 42.22% compared to manual methods in a small expert study. A fine-tuned model (LAMMPSLLM, 167-script training set) outperformed base models of similar size on expert-scored LAMMPS script generation. Fine-tuning also reduced MAE/MSE of the LLM evaluator's scores relative to human expert ratings. Four thermodynamic property calculations (heat capacity, lattice constant, melting point, thermal expansion of copper/diamond) produced results near theoretical values.",
     25   "checklist": {
     26     "artifacts": {
     27       "code_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "A GitHub repository URL is provided: https://github.com/FredericVAN/PKU_MDAgent. The paper states 'The data is also publicly available at [this URL].'"
     31       },
     32       "data_released": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The paper states 'The code and datasets supporting the findings of this study are available' at the GitHub URL. Both LSCF-Dataset and LEQS-Dataset are referenced as publicly available."
     36       },
     37       "environment_specified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed library versions are provided in the paper. Docker is mentioned as part of the system architecture but no reproducible environment specification is given."
     41       },
     42       "reproduction_instructions": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper says 'Detailed description of the code implementation details please see the GitHub repository' but provides no step-by-step reproduction instructions in the paper itself. No README commands, scripts, or reproducing-results section."
     46       }
     47     },
     48     "statistical_methodology": {
     49       "confidence_intervals_or_error_bars": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No confidence intervals or error bars are reported for the main results. Fig. 4 shows bar charts without uncertainty measures. Expert scores and time comparisons are presented as point estimates only."
     53       },
     54       "significance_tests": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The paper uses language like 'significantly improves' and 'significantly lower' but reports no statistical significance tests (no p-values, t-tests, etc.). Comparative claims are made by comparing raw numbers only."
     58       },
     59       "effect_sizes_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "The paper reports the 42.22% reduction in average task time with baseline context. Thermodynamic calculations give absolute values from both methods alongside theoretical values (e.g., 3.37 vs 3.56 J/(cm³·K) for heat capacity, against a theoretical value of ~3.45 J/(cm³·K))."
     63       },
     64       "sample_size_justified": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The number of expert evaluators is never specified — only 'multiple experts' and 'several experts' are mentioned. The LSCF-Dataset has 167 scripts. No sample size justification or power analysis is provided for any evaluation."
     68       },
     69       "variance_reported": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No standard deviations, variance, or spread measures are reported for any experimental results. It is unclear whether experiments were run multiple times."
     73       }
     74     },
     75     "evaluation_design": {
     76       "baselines_included": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper compares MDAgent-assisted workflow vs. traditional manual methods (Fig. 4a,b). For code generation, fine-tuned LAMMPSLLM is compared against 'other large models of similar size that had not been fine-tuned' (Fig. 4c)."
     80       },
     81       "baselines_contemporary": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "The specific baseline models compared in Fig. 4c are not clearly named in the paper text. Without knowing which models were compared, it is impossible to assess whether they are contemporary and competitive."
     85       },
     86       "ablation_study": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "MDAgent has multiple components (Manager, Planner, Worker, Evaluator, RAG, Tools). While Worker and Evaluator are evaluated separately, there is no systematic ablation removing components to measure their individual contributions."
     90       },
     91       "multiple_metrics": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Multiple evaluation metrics are used: task completion time (Fig. 4a), expert satisfaction scores (Fig. 4b), code generation quality scores (Fig. 4c), evaluator accuracy (Fig. 4d), MAE (Fig. 4e), and MSE (Fig. 4f)."
     95       },
     96       "human_evaluation": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Human experts evaluated both the generated scripts (scoring 1-10) and the overall MDAgent workflow (satisfaction scores). Experts rated task assistance and code quality across multiple tasks."
    100       },
    101       "held_out_test_set": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "For the LEQS-Dataset: 'A random subset of the LEQS-Dataset will be used for fine-tuning... with a separate random subset designated for testing to ensure no overlap between the two.'"
    105       },
    106       "per_category_breakdown": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Results are broken down across four thermodynamic tasks (heat capacity, lattice constant, melting point, thermal expansion) and separately for Worker evaluation (Fig. 4c), Evaluator evaluation (Fig. 4d-f), and overall assistance (Fig. 4a-b)."
    110       },
    111       "failure_cases_discussed": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "No specific failure cases are shown or analyzed. The paper acknowledges that MDAgent is 'semi-automated' but does not discuss where or how the system fails on specific tasks."
    115       },
    116       "negative_results_reported": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "Every experiment shows positive results. No configurations that failed, approaches that were abandoned, or conditions where MDAgent performed worse than baselines are reported."
    120       }
    121     },
    122     "claims_and_evidence": {
    123       "abstract_claims_supported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The abstract claims a 42.22% time reduction (supported by Fig. 4a), improved code generation (supported by Fig. 4c), and improved review capabilities (supported by Fig. 4d-f). The claims are generally supported by the presented results."
    127       },
    128       "causal_claims_justified": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper claims MDAgent 'significantly improves' capabilities and 'reduces' time — these are causal claims. The study compares with/without MDAgent but with no randomization, no control for confounds (e.g., task familiarity, order effects), and an unspecified number of experts."
    132       },
    133       "generalization_bounded": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper claims MDAgent as 'a general paradigm that can be extended to text-to-code applications in the field of materials science' and states it 'possesses inherent scalability.' However, only 4 thermodynamic tasks on 2 simple materials (copper, diamond) using LAMMPS were tested — far too narrow to support these general claims."
    137       },
    138       "alternative_explanations_discussed": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "No alternative explanations are discussed. The time savings could be due to novelty effects, task ordering, or expert familiarity. The code quality improvements from fine-tuning could be due to task-specific memorization. None of these are considered."
    142       },
    143       "proxy_outcome_distinction": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The paper measures expert satisfaction scores and task completion time, then frames these as evidence that MDAgent 'effectively assist[s] experts' and has 'potential applications in the field of materials science.' The gap between limited proxy measurements and broad effectiveness claims is not acknowledged."
    147       }
    148     },
    149     "setup_transparency": {
    150       "model_versions_specified": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The paper references 'Qwen' and 'ChatGLM' without exact version numbers or snapshot dates. The base model used for fine-tuning is described only as 'open-source large models.' No specific model version (e.g., Qwen-7B-v1.0) is stated."
    154       },
    155       "prompts_provided": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "No actual prompt text is provided. The paper describes the roles of Manager, Worker, and Evaluator in natural language but does not include the actual prompts or system instructions used."
    159       },
    160       "hyperparameters_reported": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "QLoRA fine-tuning with Unsloth framework is mentioned but no specific hyperparameters (learning rate, temperature, top-p, LoRA rank, epochs, batch size) are reported."
    164       },
    165       "scaffolding_described": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "The agent architecture is described in detail: Manager receives inputs and coordinates tasks, Planner decomposes objectives, Worker generates scripts, Evaluator provides feedback via a Reflexion-inspired loop with score thresholds. RAG and Tool modules are also described. Fig. 2a provides an architecture diagram."
    169       },
    170       "data_preprocessing_documented": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "For LSCF-Dataset: data gathered from official docs, papers, and open-source projects at 1:2:2 ratio, screened for correctness, annotated with three-part structure, converted to Alpaca format. For LEQS-Dataset: tasks designed by senior scientists, scripts generated by LLM, then evaluated by experts using structured rubric."
    174       }
    175     },
    176     "limitations_and_scope": {
    177       "limitations_section_present": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "There is no dedicated limitations section. The Discussion mentions that MDAgent is 'a semi-automated intelligent assistant' due to LLM limitations, but this is a single paragraph embedded in the discussion, not a substantive limitations section."
    181       },
    182       "threats_to_validity_specific": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No threats to validity are discussed. The paper does not address potential confounds such as small expert sample, order effects, task selection bias, or evaluator subjectivity."
    186       },
    187       "scope_boundaries_stated": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The paper does not explicitly state what the results do NOT show. While it mentions LAMMPS as 'a case study,' it simultaneously claims broad scalability without stating specific boundaries or untested scenarios."
    191       }
    192     },
    193     "data_integrity": {
    194       "raw_data_available": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The paper states 'The code and datasets supporting the findings of this study are... publicly available at https://github.com/FredericVAN/PKU_MDAgent.' Both datasets (LSCF and LEQS) are referenced as available."
    198       },
    199       "data_collection_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "LSCF-Dataset: 167 scripts from official docs, papers, and open-source at 1:2:2 ratio, with 127 simulation and 40 modeling scripts. LEQS-Dataset: tasks designed by senior scientists covering thermal expansion, conductivity, density, and phase transitions."
    203       },
    204       "recruitment_methods_described": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "The paper mentions 'several experts' and 'multiple experts in materials science' but provides no details on how many experts participated, how they were recruited, their qualifications, or whether recruitment could introduce bias."
    208       },
    209       "data_pipeline_documented": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The data pipeline is documented for both datasets: LSCF goes through collection → screening/validation → annotation (3-part structure) → Alpaca format conversion. LEQS goes through task design → LLM generation → expert scoring with rubric → train/test split."
    213       }
    214     },
    215     "conflicts_of_interest": {
    216       "funding_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Acknowledgements state: 'We gratefully acknowledge the support of the project \"Key Technologies Research for Data Space Application Construction Oriented towards Computational Reproduction\"' and 'supported by National Key Laboratory of Data Space Technology and System.'"
    220       },
    221       "affiliations_disclosed": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "All author affiliations are listed: Peking University School of Software and Microelectronics, National Key Laboratory of Data Space Technology and System, Advanced Institute of Big Data, and Institute of Information Engineering (CAS)."
    225       },
    226       "funder_independent_of_outcome": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "Funding is from the National Key Laboratory of Data Space Technology and System, a government-affiliated research lab. They have no apparent financial stake in the outcome of LAMMPS code generation results."
    230       },
    231       "financial_interests_declared": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "The paper states: 'The authors declare no competing interests.'"
    235       }
    236     },
    237     "contamination": {
    238       "training_cutoff_stated": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "The base LLM's training data cutoff is never stated. The models that were fine-tuned (described only as 'open-source large models') could have pre-training data that includes LAMMPS scripts or solutions to the test tasks."
    242       },
    243       "train_test_overlap_discussed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "The paper separates fine-tuning and test subsets of LEQS-Dataset, but does not discuss whether the base model's pre-training data might contain LAMMPS scripts, documentation, or solutions similar to the test tasks."
    247       },
    248       "benchmark_contamination_addressed": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No discussion of whether the LAMMPS scripts, documentation, or similar thermodynamic problems used in evaluation were in the base model's pre-training data. Given that LAMMPS documentation is publicly available online, contamination risk is non-trivial."
    252       }
    253     },
    254     "human_studies": {
    255       "pre_registered": {
    256         "applies": true,
    257         "answer": false,
    258         "justification": "No pre-registration is mentioned. The expert study evaluating MDAgent was not pre-registered on any platform."
    259       },
    260       "irb_or_ethics_approval": {
    261         "applies": true,
    262         "answer": false,
    263         "justification": "No IRB or ethics board approval is mentioned despite the study involving human expert participants completing tasks and providing evaluations."
    264       },
    265       "demographics_reported": {
    266         "applies": true,
    267         "answer": false,
    268         "justification": "Participants are described only as 'materials science experts' and 'senior materials scientists.' No demographics (number, experience level, affiliation, gender) are reported."
    269       },
    270       "inclusion_exclusion_criteria": {
    271         "applies": true,
    272         "answer": false,
    273         "justification": "No inclusion or exclusion criteria are stated for expert participants. The paper simply references 'several experts' and 'multiple experts' without describing how they were selected."
    274       },
    275       "randomization_described": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "No randomization procedure is described for the comparison of MDAgent-assisted vs. manual task completion. It is unclear whether task order or assignment was randomized."
    279       },
    280       "blinding_described": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No blinding is described. For the assistant evaluation, experts obviously knew which method they were using. For code evaluation (Fig. 4c), it is not stated whether evaluators were blind to which model generated each script."
    284       },
    285       "attrition_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No attrition or dropout information is reported. Neither the initial nor the final number of expert participants is stated."
    289       }
    290     },
    291     "cost_and_practicality": {
    292       "inference_cost_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No inference cost, tokens consumed, or per-task compute time is reported. The 42.22% time reduction refers to human task time, not computational cost."
    296       },
    297       "compute_budget_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No GPU hours, training time, total compute budget, or hardware specifications are reported for fine-tuning or inference."
    301       }
    302     },
    303     "experimental_rigor": {
    304       "seed_sensitivity_reported": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "No mention of multiple random seeds or sensitivity analysis. It is unclear whether fine-tuning or generation experiments were repeated across seeds."
    308       },
    309       "number_of_runs_stated": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "The number of experimental runs is never stated. Results are presented without indicating how many times each experiment was performed."
    313       },
    314       "hyperparameter_search_budget": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "No hyperparameter search budget is reported. QLoRA fine-tuning is mentioned but the number of configurations tried, search method, or compute spent on tuning is not discussed."
    318       },
    319       "best_config_selection_justified": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "No description of how the final model configuration was selected. Only the best results are presented with no explanation of the selection process."
    323       },
    324       "multiple_comparison_correction": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "Multiple comparisons are made across models, tasks, and metrics, but no statistical tests are performed at all, let alone corrections for multiple comparisons."
    328       },
    329       "self_comparison_bias_addressed": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "The authors evaluate their own MDAgent system and fine-tuned models. No acknowledgment of self-evaluation bias, and no independent evaluation is included."
    333       },
    334       "compute_budget_vs_performance": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "Performance is not reported as a function of compute budget. The fine-tuned model may use different compute than baselines, but this is not discussed or controlled for."
    338       },
    339       "benchmark_construct_validity": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "The LEQS-Dataset evaluation uses expert scores (1-10) based on a custom rubric. No discussion of whether this scoring rubric has been validated or whether it actually measures the capabilities claimed."
    343       },
    344       "scaffold_confound_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "When comparing fine-tuned vs. base models for code generation (Fig. 4c), it is not explicitly stated whether the same scaffold/prompting setup was used. The scaffold confound is not discussed."
    348       }
    349     },
    350     "data_leakage": {
    351       "temporal_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of temporal leakage. The base models may have been trained on LAMMPS documentation and scripts that are similar to the evaluation tasks."
    355       },
    356       "feature_leakage_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of whether the evaluation setup leaks information. The LEQS task descriptions may contain hints that artificially improve generation quality."
    360       },
    361       "non_independence_addressed": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No discussion of whether fine-tuning and test data share structural similarities (e.g., similar LAMMPS patterns, same material types, similar script structures from the same domain)."
    365       },
    366       "leakage_detection_method": {
    367         "applies": true,
    368         "answer": false,
    369         "justification": "No leakage detection or prevention method is applied. No overlap analysis, canary strings, or decontamination is mentioned beyond the basic train/test split."
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "MDAgent reduces average task completion time by 42.22% compared to traditional manual methods.",
    376       "evidence": "Fig. 4a shows task elapsed time comparison between MDAgent-assisted and manual approaches across LAMMPS tasks. The specific percentage is stated in the abstract.",
    377       "supported": "weak"
    378     },
    379     {
    380       "claim": "Fine-tuned LAMMPSLLM outperforms other large models of similar size on LAMMPS script generation.",
    381       "evidence": "Fig. 4c shows expert evaluation scores for mainstream and fine-tuned models. Experts scored generated scripts on a 1-10 scale using the LEQS evaluation rubric.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "Fine-tuning reduces the MAE and MSE of the LLM evaluator's scores compared to human expert scores.",
    386       "evidence": "Fig. 4e and 4f show MAE and MSE improvements after fine-tuning for the LammpsEvaluator.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "MDAgent can effectively assist novice materials scientists in solving LAMMPS-related problems.",
    391       "evidence": "Fig. 4b shows expert satisfaction ratings. Four demonstration tasks produced thermodynamic values near theoretical references (Fig. 3).",
    392       "supported": "weak"
    393     },
    394     {
    395       "claim": "MDAgent's methodology possesses inherent scalability and can be readily transferred to other computational materials science tasks.",
    396       "evidence": "Claimed based on 'established scaling law theories' (refs 31-35). No empirical evidence for transferability is provided — only 4 LAMMPS thermodynamic tasks on 2 materials were tested.",
    397       "supported": "unsupported"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "Unspecified sample size for expert study",
    403       "detail": "The number of expert participants is never stated — only 'multiple experts' and 'several experts.' This makes it impossible to assess the statistical power or reliability of the human evaluation results, including the headline 42.22% time reduction claim."
    404     },
    405     {
    406       "flag": "No statistical tests despite 'significance' language",
    407       "detail": "The paper repeatedly uses words like 'significantly' (e.g., 'significantly improves,' 'significantly lower') but performs no statistical tests. The 42.22% time reduction has no confidence interval or p-value."
    408     },
    409     {
    410       "flag": "No error bars or uncertainty quantification",
    411       "detail": "All results in Fig. 4 are presented as point estimates without error bars, standard deviations, or confidence intervals. It is unclear whether experiments were repeated."
    412     },
    413     {
    414       "flag": "Overclaiming from narrow evidence base",
    415       "detail": "The paper claims 'inherent scalability' and a 'general paradigm' for materials science text-to-code based on only 4 thermodynamic tasks with 2 simple materials (copper, diamond) using a single software package (LAMMPS)."
    416     },
    417     {
    418       "flag": "Unidentified baseline models",
    419       "detail": "The paper compares against 'other large models of similar size' and 'mainstream models' (Fig. 4c) but does not clearly name which specific models were compared, making the comparison unverifiable."
    420     },
    421     {
    422       "flag": "Tiny fine-tuning dataset",
    423       "detail": "The LSCF-Dataset contains only 167 scripts. While the paper cites references claiming this is sufficient, the breadth of claims about materials science generalization is not supported by this dataset size."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "GPT-4 Technical Report",
    429       "authors": ["OpenAI"],
    430       "year": 2023,
    431       "arxiv_id": "2303.08774",
    432       "doi": "10.48550/arXiv.2303.08774",
    433       "relevance": "Foundational LLM whose code generation capabilities underpin much of the agentic AI programming research space."
    434     },
    435     {
    436       "title": "A survey on large language model based autonomous agents",
    437       "authors": ["L. Wang"],
    438       "year": 2024,
    439       "relevance": "Comprehensive survey of LLM-based agent architectures directly relevant to the agent design patterns evaluated in this survey."
    440     },
    441     {
    442       "title": "Autogen: Enabling Next-gen LLM Applications Via Multi-agent Conversation",
    443       "authors": ["Q. Wu"],
    444       "year": 2024,
    445       "relevance": "Multi-agent conversation framework relevant to agent orchestration and code generation workflows."
    446     },
    447     {
    448       "title": "Augmenting large language models with chemistry tools",
    449       "authors": ["M. A. Bran"],
    450       "year": 2024,
    451       "doi": "10.1038/s42256-024-00832-8",
    452       "relevance": "ChemCrow demonstrates LLM tool-use for scientific domain tasks, directly comparable to MDAgent's approach for materials science."
    453     },
    454     {
    455       "title": "Honeycomb: A Flexible LLM-based Agent System for Materials Science",
    456       "authors": ["H. Zhang"],
    457       "year": 2024,
    458       "arxiv_id": "2409.00135",
    459       "relevance": "LLM agent system for materials science without fine-tuning, directly comparable to the fine-tuning approach taken in this paper."
    460     },
    461     {
    462       "title": "Agent-as-a-Judge: Evaluate Agents with Agents",
    463       "authors": ["M. Zhuge"],
    464       "year": 2024,
    465       "arxiv_id": "2410.10934",
    466       "relevance": "Framework for using LLM agents as evaluators, relevant to the LammpsEvaluator component and AI evaluation methodology."
    467     },
    468     {
    469       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    470       "authors": ["N. Shinn"],
    471       "year": 2023,
    472       "arxiv_id": "2303.11366",
    473       "relevance": "Self-reflection mechanism for LLM agents that inspired MDAgent's Worker-Evaluator feedback loop."
    474     },
    475     {
    476       "title": "Auto-GPT for Online Decision Making: Benchmarks and Additional Opinions",
    477       "authors": ["H. Yang"],
    478       "year": 2023,
    479       "arxiv_id": "2306.02224",
    480       "relevance": "Autonomous AI execution framework relevant to the agentic workflow paradigm."
    481     },
    482     {
    483       "title": "TaskWeaver: A Code-First Agent Framework",
    484       "authors": ["B. Qiao"],
    485       "year": 2024,
    486       "arxiv_id": "2311.17541",
    487       "relevance": "Code-first agent framework for translating user requests into executable code, directly relevant to code generation agent design."
    488     },
    489     {
    490       "title": "ChatMOF: An artificial intelligence system for predicting and generating metal-organic frameworks using large language models",
    491       "authors": ["Y. Kang", "J. Kim"],
    492       "year": 2024,
    493       "doi": "10.1038/s41467-024-48998-4",
    494       "relevance": "LLM system for materials science (metal-organic frameworks), demonstrating AI-assisted scientific code/structure generation."
    495     },
    496     {
    497       "title": "Large-language-model-based AI agent for organic semiconductor device research",
    498       "authors": ["Q. Zhang"],
    499       "year": 2024,
    500       "doi": "10.1002/adma.202405163",
    501       "relevance": "LLM-based agent for domain-specific scientific research, closely paralleling MDAgent's approach in a different materials domain."
    502     }
    503   ],
    504   "engagement_factors": {
    505     "practical_relevance": {
    506       "score": 1,
    507       "justification": "Useful only for materials scientists working with LAMMPS thermodynamic simulations; very narrow domain applicability."
    508     },
    509     "surprise_contrarian": {
    510       "score": 0,
    511       "justification": "Confirms the expected finding that fine-tuned LLMs can help with domain-specific code generation."
    512     },
    513     "fear_safety": {
    514       "score": 0,
    515       "justification": "No safety or security concerns raised; the domain is scientific simulation."
    516     },
    517     "drama_conflict": {
    518       "score": 0,
    519       "justification": "No controversy, no challenges to existing claims or institutions."
    520     },
    521     "demo_ability": {
    522       "score": 1,
    523       "justification": "GitHub repository exists but requires LAMMPS domain expertise and specific setup to try."
    524     },
    525     "brand_recognition": {
    526       "score": 0,
    527       "justification": "From Peking University, not a widely recognized AI lab; published in Scientific Reports, a broad-scope journal."
    528     }
    529   }
    530 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs