scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21037B)
      1 {
      2   "paper": {
      3     "title": "Crystalyse: a multi-tool agent for materials design",
      4     "authors": ["Ryan Nduma", "Hyunsoo Park", "Aron Walsh"],
      5     "year": 2025,
      6     "venue": "",
      7     "doi": ""
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub repository provided: https://github.com/ryannduma/CrystaLyse.AI, also available on PyPI."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The TRINITY Gold dataset compositions are described as available in the code repository. Materials Project data is publicly accessible via API. ICSD data requires subscription but is a standard database."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is mentioned in the paper. The paper references PyPI installation but does not list library versions or dependency details."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The CLI commands (crystalyse discover, crystalyse) are mentioned but no detailed protocol for reproducing the experimental results is given."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The TRINITY benchmark reports 'mean ± 1 SD (n=3 runs)' for performance metrics across model-format pairs."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper compares SMACT vs GPT-4o vs Gemini on composition validity but uses no statistical significance tests to support claims of difference."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Effect sizes are reported with baseline context, e.g., 'GPT-4o improved from 64.5% to 89.4% (+25 percentage points)' and adversarial pass rate '86% from a 57% baseline'."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification for why 2,087 compositions were chosen for TRINITY Gold, why 28 adversarial prompts were used for evaluation, or why 3 replicates were run."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Standard deviation reported across 3 runs for TRINITY benchmark: 'mean ± s.d.' notation used in confusion matrices and performance metrics."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "SMACT (domain tool) compared against GPT-4o and Gemini 2.0 Flash on composition validity. Adversarial testing uses a v0 baseline prompt at 57% pass rate."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "GPT-4o (2024-05-13) and Gemini 2.0-Flash are contemporary models used as baselines for composition validity."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The prompt engineering progression (v0 through v3) serves as an ablation showing the contribution of safety, provenance, and robustness components. The three operational modes (creative/adaptive/rigorous) are also compared."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Accuracy, precision, recall, false positive counts, and confusion matrices are reported for the TRINITY benchmark. Pass rates across multiple adversarial dimensions are also reported."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of the agent's outputs was conducted. The materials design tasks are evaluated by the authors but no structured human evaluation protocol is described."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The adversarial testing used 28 prompts for iterative refinement and a separate complete 70-prompt suite for independent validation. The TRINITY Gold dataset is a constructed benchmark not used for system development."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Performance broken down by adversarial dimension (hallucination, safety, legitimate use, sustainability, robustness) and by composition order (binary through senary) and format type."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Detailed failure analysis: disguised toxic requests (25% failure rate), high-energy materials ambiguity (25% failure), render gate false positives, LLM format sensitivity. Section 'Agent robustness and failure analysis' dedicated to this."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Reported that v1 prompt over-corrected (0% robustness), that render gate had false positives, that disguised toxic requests bypass safety 25% of the time, and that LLMs show precision collapse on composition tasks."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims about provenance enforcement eliminating hallucinations (supported by shadow validation), 86% adversarial pass rate from 57% baseline (supported by Figure 4), and open-source release (GitHub link provided) are all supported."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about provenance enforcement eliminating hallucinations are supported by the prompt version ablation (v0-v3) showing controlled addition of components. The shadow validation A/B testing provides causal evidence."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper describes Crystalyse as a 'prototype' initially focused on 'inorganic crystals', acknowledges limitations in property coverage (energy only, not electronic/optical/magnetic), and notes scope is bounded by accessible databases."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper discusses whether LLM format sensitivity is due to tokenisation vs training corpus bias, investigating and ruling out tokenisation as the primary explanation (Supplementary Section S4). Limitations section discusses alternative architectural approaches."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Specific model snapshots provided: 'o3 (snapshot o3-2025-04-16)', 'o4-mini (snapshot o4-mini-2025-04-16)', 'GPT-4o (gpt-4o-2024-05-13)', 'Gemini 2.0-Flash (gemini-2.0-flash-exp-01-21)'."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "System prompts are described conceptually (v0-v3 evolution) but the actual prompt text is not provided in the paper. The paper says prompts are in supplementary sections but only describes their intent, not full text."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Temperature 0 stated for TRINITY benchmark. 'Default reasoning effort settings' stated for o3/o4-mini. Timeouts specified (60s SMACT, 300s Chemeleon, 600s MACE). Fuzzy matching tolerance ±0.001."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Detailed description of the agentic architecture: three-layer hierarchy (orchestration, execution, validation), MCP tool integration, render gate, provenance system, clarification engine, mode selection, memory architecture. Figures 1-2 illustrate the workflow."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "TRINITY Gold dataset construction documented: 1,500 ICSD positives and 587 MP negatives, deduplication on reduced formula, charge-neutral filtering, valid elements (Z≤118), no partial occupancies, stratified by composition order with counts (475/534/440/335/303)."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Discussion section contains substantive limitations discussion: 'Limitations in our approach reflect the current toolset and model capabilities' with specific points about discovery scope, property coverage, and architectural constraints."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Specific threats discussed: ML force field provides only point estimates without uncertainty, discovery scope bounded by databases, electronic/optical/magnetic properties not covered, disguised toxic requests bypass safety 25% of the time, render gate false positives from pattern-matching limitations."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Explicit scope boundaries: 'initially focused on inorganic crystals', energy evaluations only (not electronic/optical/magnetic), bounded by accessible databases, and three specific principles about what was not shown."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Data stated as available in the code repository. TRINITY Gold dataset compositions and source identifiers (ICSD/MP IDs) retained for reproducibility."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "TRINITY Gold construction described in detail: 1,500 from ICSD release 2024.1, 587 from Materials Project January 2025 snapshot with Ehull > 0.5 eV/atom, tagged theoretical/never observed. Inclusion/exclusion criteria specified."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data sources are standard computational materials databases (ICSD, Materials Project)."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Pipeline documented: source selection, deduplication on reduced formula, charge-neutral filtering, element validation, stratified sampling by composition order with counts at each stage, five encoding formats generated."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Funding disclosed in Acknowledgements: EPSRC project EP/X037754/1, AIchemy hub EPSRC grants EP/Y028775/1 and EP/Y028759/1, including a summer studentship."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors affiliated with Department of Materials, Imperial College London. The system uses OpenAI models but authors have no stated affiliation with OpenAI."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "EPSRC is a UK government research council with no financial stake in the performance of Crystalyse or any particular AI model."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement found in the paper. Absence of disclosure is not the same as absence of conflict."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper evaluates GPT-4o and Gemini on the TRINITY benchmark but does not state the training data cutoff dates for these models."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether ICSD compositions or Materials Project data appeared in GPT-4o or Gemini training data. The format sensitivity finding hints at training corpus effects but contamination is not directly addressed."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "TRINITY Gold is built from ICSD and Materials Project entries, long-established databases (ICSD by subscription, Materials Project openly accessible) whose contents may appear in LLM training corpora. The paper discusses 'training corpus bias' but does not address whether models memorized specific compositions from training data."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Wall-clock times reported per mode: Creative 65-91s, Adaptive 172s, Rigorous 199-279s. Individual tool costs stated: SMACT <10ms, MACE 1-2s single-point up to ~1hr for MD. Shadow validation overhead 8.3%."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total API spend or GPU hours reported. Hardware described as 'consumer laptops to workstations' but specific hardware not quantified. No total computational budget for the study."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Provenance enforcement eliminated material-property hallucinations, with adversarial pass rate reaching 86% from a 57% baseline.",
    286       "evidence": "Shadow validation (n=50 prompts) found 0/50 unprovenanced direct material properties. Adversarial suite v0→v3 progression: 57%→71%→79%→86% (Figure 4, Supplementary S5-S7).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "SMACT achieves 90.8% accuracy with zero false positives on composition validity, while LLMs show precision collapse (77.8-78.8%).",
    291       "evidence": "TRINITY Gold benchmark (n=2,087 compositions, 3 runs per model-format pair) with confusion matrices in Figure 3. SMACT: 100% precision, 87.1% recall vs GPT-4o: 77.8% precision, 70.9% recall.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "LLM accuracy on composition validity is format-dependent, improving 20-25 percentage points with ICSD-style formatting.",
    296       "evidence": "GPT-4o improved from 64.5% to 89.4%, Gemini from 71.1% to 90.8% when switching from PyMatgen to ICSD format. Tokenisation analysis ruled out mid-element fragmentation as explanation (Supplementary S4).",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Adaptive mode discovered a thermodynamically stable phase (Na4Mn2Si2O8) below the convex hull in the battery cathode task.",
    301       "evidence": "Reported as 7 meV/atom below convex hull in Task 2 results. However, this is a single run with no replication or uncertainty quantification on the energy calculation.",
    302       "supported": "moderate"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval", "case-study"],
    306   "key_findings": "Crystalyse is a provenance-enforced single-agent system for computational materials design that orchestrates composition screening (SMACT), crystal structure generation (Chemeleon), and ML force-field evaluation (MACE-MP0) via MCP. On the TRINITY Gold benchmark of 2,087 compositions, domain tools (SMACT) achieve 100% precision vs 78% for standalone LLMs, with LLM performance showing strong format dependence (+20-25pp with ICSD notation). Iterative prompt engineering improved adversarial robustness from 57% to 86% pass rate, with provenance enforcement eliminating material-property hallucinations in shadow validation testing.",
    307   "red_flags": [
    308     {
    309       "flag": "No contamination analysis for benchmark",
    310       "detail": "The TRINITY Gold benchmark is built from ICSD (subscription-access) and Materials Project (openly accessible) data. GPT-4o and Gemini likely saw these compositions during training, but the paper does not address this despite discussing 'training corpus bias' in a different context."
    311     },
    312     {
    313       "flag": "Small adversarial test set",
    314       "detail": "System refinement was done on only 28 prompts (2 per category). The full 70-prompt suite is deferred to the supplementary material for 'independent validation' but results on it are not reported in the main paper."
    315     },
    316     {
    317       "flag": "Materials design tasks lack independent validation",
    318       "detail": "The three design tasks (quaternary oxides, Na-ion cathodes, indoor PV) produce plausible-sounding results but are not validated against DFT calculations or experimental data. The paper acknowledges these are 'plausible predictions' but presents them as demonstrations."
    319     }
    320   ],
    321   "cited_papers": [
    322     {
    323       "title": "ChemCrow: Augmenting large-language models with chemistry tools",
    324       "authors": ["A. M. Bran"],
    325       "year": 2023,
    326       "arxiv_id": "2304.05376",
    327       "relevance": "Seminal work on LLM tool augmentation for chemistry, directly comparable agentic approach."
    328     },
    329     {
    330       "title": "Autonomous chemical research with large language models",
    331       "authors": ["D. A. Boiko", "R. MacKnight", "B. Kline", "G. Gomes"],
    332       "year": 2023,
    333       "relevance": "Demonstrates autonomous LLM-driven chemical research (Coscientist), key prior work on scientific agents."
    334     },
    335     {
    336       "title": "DeepSeek-R1 incentivizes reasoning in LLMs through reinforcement learning",
    337       "authors": ["D. Guo"],
    338       "year": 2025,
    339       "relevance": "Reasoning LLM capabilities that underpin agentic scientific systems."
    340     },
    341     {
    342       "title": "Constitutional Classifiers: Defending against Universal Jailbreaks across Thousands of Hours of Red Teaming",
    343       "authors": ["M. Sharma"],
    344       "year": 2025,
    345       "arxiv_id": "2501.18837",
    346       "relevance": "Safety classifier approach referenced for future adversarial robustness improvements in agent systems."
    347     },
    348     {
    349       "title": "Why Do Multi-Agent LLM Systems Fail? A Comprehensive Taxonomy and Analysis",
    350       "authors": ["J. Zhang"],
    351       "year": 2025,
    352       "relevance": "Analysis of multi-agent LLM failure modes, motivates single-agent design choice in Crystalyse."
    353     },
    354     {
    355       "title": "Reducing Tool Hallucination via Reliability Alignment",
    356       "authors": ["L. Xu"],
    357       "year": 2024,
    358       "arxiv_id": "2412.04141",
    359       "relevance": "Addresses tool hallucination problem in LLM agents, directly relevant to provenance enforcement approach."
    360     },
    361     {
    362       "title": "Making Language Models Better Tool Learners with Execution Feedback",
    363       "authors": ["S. Qiao"],
    364       "year": 2024,
    365       "relevance": "Execution-time validation for LLM tool use, related to Crystalyse's runtime provenance approach."
    366     },
    367     {
    368       "title": "Kosmos: An AI Scientist for Autonomous Discovery",
    369       "authors": ["L. Mitchener"],
    370       "year": 2025,
    371       "relevance": "Computationally intensive multi-agent scientific discovery system, contrasted with Crystalyse's single-agent approach."
    372     },
    373     {
    374       "title": "AtomAgents: Alloy design and discovery through physics-aware multi-modal multi-agent artificial intelligence",
    375       "authors": ["A. Ghafarollahi", "M. J. Buehler"],
    376       "year": 2024,
    377       "arxiv_id": "2407.10022",
    378       "relevance": "Multi-agent materials design system, directly comparable to Crystalyse's single-agent approach."
    379     }
    380   ]
    381 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs