scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31172B)
      1 {
      2   "paper": {
      3     "title": "Hypothesis Generation for Materials Discovery and Design Using Goal-Driven and Constraint-Guided LLM Agents",
      4     "authors": [
      5       "Shrinidhi Kumbhar",
      6       "Venkatesh Mishra",
      7       "Kevin Coutinho",
      8       "Divij Handa",
      9       "Ashif Iquebal",
     10       "Chitta Baral"
     11     ],
     12     "year": 2025,
     13     "venue": "North American Chapter of the Association for Computational Linguistics (NAACL)",
     14     "arxiv_id": "2501.13299",
     15     "doi": "10.48550/arXiv.2501.13299"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "Integrating knowledge graphs and iterative critic feedback into an LLM-based hypothesis generation framework (ACCELMAT) improves both closeness to ground truth (80% vs 70% baseline) and quality scores (89% vs 79.67%) on the authors' MATDESIGN benchmark of 50 materials science problems. The temporal design using January 2024 papers addresses contamination relative to late-2023 model training cutoffs. Human expert evaluation by 4 PhD students qualitatively paralleled the automated LLM-based evaluation rankings, though no inter-rater reliability statistics were reported.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Footnote 1 provides a GitHub URL: 'https://github.com/shri071/Hypothesis-Generation-for-Materials-Discovery-and-Design-Using-Goal-Driven-and-Constraint-Guided-LLM'. The abstract also states 'Data and code are available.'"
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The MATDESIGN dataset is released alongside the code at the same GitHub repository per footnote 1."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No requirements.txt, Dockerfile, conda environment file, or environment setup section is mentioned in the paper. Only the LLM model names are listed."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper provides full prompts in the appendices but does not include step-by-step reproduction instructions, scripts to replicate experiments, or a README with commands."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Results in Figure 2 and Section 7 are reported as point estimates (e.g., '70%', '80%', '89%') with no confidence intervals, error bars, or uncertainty quantification."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "Claims like 'a 6.67% improvement over the feedback-only setup' are made by comparing raw percentages without any statistical significance tests (no p-values, t-tests, or bootstrap tests)."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The paper reports improvements with baseline context, e.g., 'Closeness score of 73.33%, a 3.33% improvement' over 70%, and 'Quality score of 85.67%, marking a 6% increase over the feedback-free setup' (Section 7). The quoted differences are absolute (percentage-point) gains reported alongside their baseline values."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The dataset contains 50 instances from 50 papers. The Limitation section acknowledges 'its size (50 papers) may not fully capture the diversity of materials science research' but provides no power analysis or formal justification for this sample size."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No standard deviations, variance across runs, or spread measures are reported. Results appear to be from single experimental runs."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Section 6.1 establishes the 'Hypotheses Generation without Feedback from Critics' as a baseline. Three configurations are compared progressively (no feedback, with feedback, with KG and feedback)."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "Table 1 compares ACCELMAT with LLMatDesign, ChemReasoner, and SciAgents only on qualitative features (diverse materials, tool-free, etc.), not on actual performance. No external system is run on MATDESIGN for quantitative comparison."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The three configurations progressively add components: (1) HGA + CA without feedback, (2) + iterative feedback with SA, (3) + knowledge graph. This functions as a progressive ablation study showing each component's contribution."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Two primary metrics are used: Closeness (with 3 sub-metrics: Concept Overlap, Property Overlap, Keyword Matching) and Quality (with 6 sub-metrics: Alignment, Plausibility, Innovation, Testability, Feasibility, Impact). See Section 5."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Section 7.1: '42 suggestions, encompassing all three configurations, were independently evaluated by four PhD students in Materials Science. Their assessments employed the same evaluation metrics used by the automated system.'"
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "All 50 MATDESIGN instances appear to be used for evaluation. No train/dev/test split is described, and it is unclear whether any instances were used during system development or prompt tuning."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Figure 2 provides per-criterion breakdowns for both Closeness (3 criteria) and Quality (6 criteria) across all three configurations."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 7 discusses specific failure modes for each configuration: 'Lack of Consensus,' 'Incomplete Adherence to Constraints,' 'Bias in Material and Method Selection,' and for KG: 'information retrieved from the knowledge graph... tends to focus on fundamental materials.'"
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper reports that 'hypotheses remained focused on well-established methodologies, with limited exploration of unconventional solutions' even with feedback, and that KG information sometimes produced 'broad and generalized suggestions that lack the specificity needed for practical implementation.'"
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims about curating a novel dataset, testing LLM-based agents, and proposing a scalable evaluation metric are all supported by Sections 3, 4-6, and 5 respectively. Claims are hedged appropriately ('explore the potential,' 'aim to advance')."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The paper claims feedback and KG integration 'enhance performance.' The progressive addition of components (no feedback → feedback → KG + feedback) with the same base model (GPT-4o) constitutes controlled single-variable manipulation adequate for these causal claims."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title claims 'Materials Discovery and Design' generally, and the abstract says the framework aims 'to advance future research in accelerating materials discovery and design with LLMs.' However, results are from 50 specific instances from journals in a single month (January 2024), with a single hypothesis generator (GPT-4o). These scope limitations are not acknowledged in the claims."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "No alternative explanations are considered for why KG+feedback outperforms other configurations. For example, the KG configuration provides more context tokens to the generator — the improvement could be due to additional context rather than structured knowledge. This confound is not discussed."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper measures LLM-scored Closeness and Quality metrics and frames these as evaluating 'materials discovery and design.' The gap between LLM evaluation scores (a proxy) and actual scientific utility of hypotheses is not discussed. The BERTScore comparison in Appendix G actually contradicts the LLM evaluator rankings, but this discrepancy is dismissed rather than explored."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "Models are identified by marketing names: 'GPT-4o,' 'Claude-3.5-Sonnet,' 'Gemini-1.5-Flash,' 'OpenAI-o1-preview.' No API version identifiers, snapshot dates, or specific model IDs (e.g., 'gpt-4o-2024-05-13') are provided."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Full prompt texts are provided in Appendices C and D, including the hypothesis generator prompt (C.3), critic feedback prompt (C.4), summarizer prompt (C.5), and evaluation prompts (Appendix E). These contain actual prompt text, not just descriptions."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No temperature, top-p, max tokens, or other sampling parameters are reported for any of the LLMs used (GPT-4o, Claude-3.5-Sonnet, Gemini-1.5-Flash, o1-preview)."
    160       },
    161       "scaffolding_described": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The multi-agent framework is described in detail in Section 4: HGA, three CAs, SA, and EA roles are defined. Algorithm 1 provides pseudocode for the iterative feedback loop. The knowledge graph integration workflow is described in Section 6.3."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "Section 3 describes the dataset structure (goals, constraints, materials, methods from 50 papers) and mentions expert assistance, but does not document the full pipeline: how many papers were initially considered, what specific inclusion/exclusion criteria were used beyond 'prominent journals' and 'January 2024,' or how expert extractions were validated."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "A dedicated 'Limitation' section is present, discussing dataset size constraints, LLM feedback reliability, hallucination risks, and dependency on human-provided constraints."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The limitations are specific to this study: 'its size (50 papers) may not fully capture the diversity,' 'even unanimous agreement among state-of-the-art LLM-based critics does not guarantee scientific accuracy,' and 'the risk of hallucinated or flawed suggestions remains a challenge.'"
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound the scope to specific material types, application domains, or settings where the approach may not generalize. The Limitation section discusses expansion opportunities but not explicit exclusions from claims."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "The MATDESIGN dataset and code are released on GitHub (footnote 1), enabling independent verification of the benchmark data."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 3 describes data collection: information was extracted from 50 research papers published in January 2024 in prominent journals (Nature, Nature Communications, Progress in Organic Coatings), with materials science expert assistance for all extractions."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "For the human evaluation (Section 7.1), '42 suggestions... were independently evaluated by four PhD students in Materials Science' but no recruitment details are provided — how the 4 PhD students were selected, from which lab, or whether this introduces evaluation bias."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "The paper states information was extracted from 50 papers but does not document the pipeline with counts at each stage (e.g., how many journals were searched, how many papers were considered, filtering criteria at each step, or inter-annotator agreement for the expert extractions)."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Acknowledgement section: 'This research was supported by the Engineering Research and Development Center - Information Technology Laboratory (ERDC-ITL) under Contract No. W912HZ24C0022.' Also acknowledges ASU Research Computing resources."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "All authors are listed with Arizona State University affiliation. They use third-party models (GPT-4o, Claude, Gemini) without being affiliated with those companies."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "ERDC-ITL is a government research lab with no direct financial stake in whether particular LLM-based hypothesis generation approaches outperform others. The disclaimer states 'Any opinions, findings and conclusions... do not necessarily reflect the views of the ERDC-ITL.'"
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement is present in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": true,
    237         "justification": "Footnote 2: 'Llama-3.1-70B (Model Card) and GPT-4o (Documentation) have knowledge cutoffs of December 2023 and October 2023 respectively.'"
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": true,
    242         "justification": "Section 3: 'The selection of publications from January 2024 is essential to ensure that the ground truth information... is not present in the training corpus of LLMs we use, which have a training cutoff of late 2023.' This temporal design addresses potential overlap."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": true,
    247         "justification": "The entire benchmark design is built around contamination avoidance: MATDESIGN uses papers published after model training cutoffs. Section 2.2 and Table 2 explicitly list 'No Data Leakage' as a feature distinguishing MATDESIGN from prior benchmarks."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": true,
    253         "answer": false,
    254         "justification": "No mention of pre-registration for the human evaluation study with 4 PhD students."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": true,
    258         "answer": false,
    259         "justification": "No mention of IRB or ethics board approval for the human evaluation involving 4 PhD students."
    260       },
    261       "demographics_reported": {
    262         "applies": true,
    263         "answer": false,
    264         "justification": "The only demographic information for human evaluators is 'four PhD students in Materials Science.' No experience level, years in program, gender, or other characterization is provided."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": true,
    268         "answer": false,
    269         "justification": "No criteria are stated for selecting the 4 PhD student evaluators — why these students, what qualifications were required, etc."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "The human evaluation is an expert rating task, not an experimental study with treatment/control conditions requiring randomization."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "The human evaluation is an expert rating task assessing output quality, not an experimental study where blinding would be applicable."
    280       },
    281       "attrition_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No mention of whether all 4 PhD students completed all evaluations or if any dropout occurred."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "The paper mentions wanting to 'restrict the cost per instance and the runtime' (Section 7) but does not report actual API costs, tokens consumed, or wall-clock time for any configuration."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "The paper acknowledges use of 'ChatGPT enterprise version' and ASU computing resources but does not quantify total API spend, compute hours, or hardware used."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No mention of multiple random seeds or sensitivity analysis. Results appear to be from single runs of each configuration."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The number of experimental runs is never stated. It is unclear whether results represent single runs or averages."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No hyperparameter search is described. The number of hypotheses (20), number of feedback cycles (5), and number of critics (3) appear to be fixed without justification or search."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": true,
    318         "justification": "All three configurations are predefined and all results are reported; there is no cherry-picking from a larger set of configurations."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": false,
    322         "answer": false,
    323         "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors created MATDESIGN, designed ACCELMAT, and developed the evaluation metric, then evaluated their system on their benchmark with their metric. This self-evaluation bias is not acknowledged."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The KG+feedback configuration requires significantly more API calls (knowledge graph queries, additional feedback cycles) than the no-feedback baseline, but performance is not normalized by compute budget."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "The paper does not discuss whether MATDESIGN actually measures the ability to generate viable hypotheses for materials discovery. The Closeness metric measures similarity to known solutions, which may penalize truly novel hypotheses — this construct validity issue is not addressed."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "The scaffold (feedback loops, KG integration) IS the variable being tested across configurations, with the same base model (GPT-4o) throughout. The scaffold is the thing being evaluated, not a confound."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": true,
    350         "justification": "The benchmark uses papers from January 2024, after model training cutoffs of late 2023, explicitly designed to prevent temporal leakage. Section 3 and footnote 2 discuss this."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "The Evaluation Agent (o1-preview) receives both generated hypotheses and ground truth for scoring. Whether o1-preview might have encountered the ground truth papers (published January 2024, potentially within o1-preview's training data given its later release) is not discussed."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "Whether the 50 MATDESIGN papers share structural similarities (e.g., overlapping domains, similar material types, common journal conventions) that could inflate performance estimates is not discussed."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": true,
    365         "justification": "A temporal split (January 2024 papers vs late 2023 training cutoffs) is used as a concrete decontamination method. Table 2 lists 'No Data Leakage' as a verified property of MATDESIGN."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "Integrating knowledge graphs and critic feedback (KG+feedback) achieves the best performance with 80% Closeness and 89% Quality scores.",
    372       "evidence": "Figure 2 and Section 7: Closeness scores of 70% (no feedback), 73.33% (feedback), 80% (KG+feedback). Quality scores of 79.67%, 85.67%, 89% respectively.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "Critic feedback increases consensus among LLM critics from 11/20 to 18/20 agreed hypotheses, rising to 19/20 when the knowledge graph is also added.",
    377       "evidence": "Table 5 shows average hypotheses agreed by all critics: 11 (without feedback), 18 (with feedback), 19 (with KG and feedback).",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "The proposed LLM-based evaluation metric aligns with human expert judgments.",
    382       "evidence": "Section 7.1: 'the human evaluations paralleled the automated results' — configuration rankings (no feedback < feedback < KG+feedback) matched between human and automated evaluation.",
    383       "supported": "weak"
    384     },
    385     {
    386       "claim": "MATDESIGN benchmark ensures no data leakage by using papers published after model training cutoffs.",
    387       "evidence": "Section 3 and footnote 2: Papers from January 2024 vs model cutoffs of October/December 2023. Table 2 lists 'No Data Leakage' as verified.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Closed-source models consistently outperform open-source counterparts on both Closeness and Quality metrics.",
    392       "evidence": "Appendix A and Figure 3: Comparison using LLaMA-3.1 70B, Gemma-2-27B, Mixtral-8x22B (open) vs GPT-4o, Claude-3.5-Sonnet, Gemini-1.5-Flash (closed).",
    393       "supported": "weak"
    394     }
    395   ],
    396   "red_flags": [
    397     {
    398       "flag": "LLM-as-judge without adequate validation",
    399       "detail": "OpenAI o1-preview serves as the primary evaluation agent, scoring hypotheses on both Closeness and Quality. The BERTScore comparison (Appendix G) contradicts the LLM evaluator rankings — KG+feedback scores lowest (50.30%) on BERTScore but highest on the LLM metric. This discrepancy is dismissed rather than investigated, undermining confidence in the evaluation."
    400     },
    401     {
    402       "flag": "Tiny sample size with no statistical tests",
    403       "detail": "All conclusions are drawn from 50 benchmark instances with no confidence intervals, significance tests, or repeated runs. Reported differences (e.g., 3.33% and 6.67% improvements) could easily be within noise for N=50."
    404     },
    405     {
    406       "flag": "Self-evaluation loop",
    407       "detail": "The authors created the benchmark (MATDESIGN), the system (ACCELMAT), and the evaluation metric, then evaluated their system on their benchmark with their metric. No independent evaluation exists."
    408     },
    409     {
    410       "flag": "Anecdotal human evaluation",
    411       "detail": "Only 4 PhD students evaluated 42 suggestions with no inter-rater reliability statistics (Kappa, ICC), no blinding information, no recruitment details. The alignment between human and automated evaluation is described qualitatively ('paralleled') without quantitative comparison."
    412     },
    413     {
    414       "flag": "No cost reporting despite multi-LLM system",
    415       "detail": "The system uses GPT-4o, Claude-3.5-Sonnet, Gemini-1.5-Flash, and o1-preview across multiple iterative cycles per instance. Total API cost is never reported, making practical feasibility impossible to assess."
    416     }
    417   ],
    418   "cited_papers": [
    419     {
    420       "title": "LLMatDesign: Autonomous Materials Discovery with Large Language Models",
    421       "authors": ["Shuyi Jia", "Chao Zhang", "Victor Fung"],
    422       "year": 2024,
    423       "arxiv_id": "2406.13163",
    424       "relevance": "LLM-based multi-agent framework for materials design hypothesis generation, directly comparable to ACCELMAT."
    425     },
    426     {
    427       "title": "ChemReasoner: Heuristic Search over a Large Language Model's Knowledge Space Using Quantum-Chemical Feedback",
    428       "authors": ["Henry W Sprueill", "Carl Edwards", "Khushbu Agarwal"],
    429       "year": 2024,
    430       "arxiv_id": "2402.10980",
    431       "relevance": "LLM agent using heuristic search and chemical feedback for catalyst discovery, a comparable domain-specific LLM agent approach."
    432     },
    433     {
    434       "title": "SciAgents: Automating Scientific Discovery through Multi-Agent Intelligent Graph Reasoning",
    435       "authors": ["Alireza Ghafarollahi", "Markus J Buehler"],
    436       "year": 2024,
    437       "arxiv_id": "2409.05556",
    438       "relevance": "Multi-agent LLM framework for scientific discovery using knowledge graphs, similar architecture to ACCELMAT."
    439     },
    440     {
    441       "title": "Empowering Biomedical Discovery with AI Agents",
    442       "authors": ["Shanghua Gao", "Ada Fang", "Yepeng Huang"],
    443       "year": 2024,
    444       "relevance": "Multi-agent AI framework for biomedical discovery, examines LLM agent architectures for hypothesis generation in scientific domains."
    445     },
    446     {
    447       "title": "What Can Large Language Models Do in Chemistry? A Comprehensive Benchmark on Eight Tasks",
    448       "authors": ["Taicheng Guo", "Bozhao Nan", "Zhenwen Liang"],
    449       "year": 2023,
    450       "relevance": "ChemLLMBench: LLM benchmark for chemistry tasks, one of the prior benchmarks MATDESIGN is compared against."
    451     },
    452     {
    453       "title": "MaScQA: A Question Answering Dataset for Investigating Materials Science Knowledge of Large Language Models",
    454       "authors": ["Mohd Zaki", "NM Krishnan"],
    455       "year": 2023,
    456       "arxiv_id": "2308.09115",
    457       "relevance": "Materials science QA benchmark for LLMs, predecessor benchmark that MATDESIGN aims to improve upon."
    458     },
    459     {
    460       "title": "Honeycomb: A Flexible LLM-based Agent System for Materials Science",
    461       "authors": ["Huan Zhang", "Yu Song", "Ziyu Hou"],
    462       "year": 2024,
    463       "arxiv_id": "2409.00135",
    464       "relevance": "LLM-based agent system for materials science tasks, directly relevant to agentic AI for scientific discovery."
    465     },
    466     {
    467       "title": "MatExpert: Decomposing Materials Discovery by Mimicking Human Experts",
    468       "authors": ["Qianggang Ding", "Santiago Miret", "Bang Liu"],
    469       "year": 2024,
    470       "arxiv_id": "2410.21317",
    471       "relevance": "LLM-based materials discovery framework that decomposes tasks mimicking human expert workflows."
    472     },
    473     {
    474       "title": "MOOSE-Chem: Large Language Models for Rediscovering Unseen Chemistry Scientific Hypotheses",
    475       "authors": ["Zonglin Yang", "Wanhao Liu", "Ben Gao"],
    476       "year": 2024,
    477       "arxiv_id": "2410.07076",
    478       "relevance": "LLM framework for rediscovering chemistry hypotheses using literature and mutation algorithms."
    479     },
    480     {
    481       "title": "Structured Chemistry Reasoning with Large Language Models",
    482       "authors": ["Siru Ouyang", "Zhuosheng Zhang", "Bing Yan"],
    483       "year": 2023,
    484       "arxiv_id": "2311.09656",
    485       "relevance": "LLM-based structured reasoning for chemistry tasks, relevant to how LLMs handle scientific reasoning."
    486     },
    487     {
    488       "title": "Galactica: A Large Language Model for Science",
    489       "authors": ["Ross Taylor", "Marcin Kardas", "Guillem Cucurull"],
    490       "year": 2022,
    491       "arxiv_id": "2211.09085",
    492       "relevance": "Large language model specifically trained for scientific text, relevant to LLM capabilities in scientific domains."
    493     },
    494     {
    495       "title": "OpenAI o1 System Card",
    496       "authors": ["Aaron Jaech", "Adam Kalai", "Adam Lerer"],
    497       "year": 2024,
    498       "arxiv_id": "2412.16720",
    499       "relevance": "System card for o1-preview, the model used as the evaluation agent in this study."
    500     }
    501   ],
    502   "engagement_factors": {
    503     "practical_relevance": {
    504       "score": 1,
    505       "justification": "The framework requires multiple LLM API subscriptions and a knowledge graph; the paper acknowledges outputs aren't ready for immediate practical application by materials scientists."
    506     },
    507     "surprise_contrarian": {
    508       "score": 0,
    509       "justification": "Results confirm the expected pattern that adding feedback and structured knowledge improves LLM output quality."
    510     },
    511     "fear_safety": {
    512       "score": 0,
    513       "justification": "No AI safety, security, or risk concerns are raised."
    514     },
    515     "drama_conflict": {
    516       "score": 0,
    517       "justification": "No controversy or conflict in the findings."
    518     },
    519     "demo_ability": {
    520       "score": 1,
    521       "justification": "Code is released on GitHub but requires multiple API keys and domain expertise to run meaningfully."
    522     },
    523     "brand_recognition": {
    524       "score": 1,
    525       "justification": "Uses well-known models (GPT-4o, Claude, Gemini) but is from an academic lab (Arizona State University), not a major AI company."
    526     }
    527   }
    528 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs