scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24565B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Hypothesis Generation for Materials Discovery and Design Using Goal-Driven and Constraint-Guided LLM Agents",
      6     "authors": [
      7       "Shrinidhi Kumbhar",
      8       "Venkatesh Mishra",
      9       "Kevin Coutinho",
     10       "Divij Handa",
     11       "Ashif Iquebal"
     12     ],
     13     "year": 2025,
     14     "venue": "North American Chapter of the Association for Computational Linguistics",
     15     "arxiv_id": "2501.13299",
     16     "doi": "10.48550/arXiv.2501.13299"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims of a curated novel dataset, LLM-based agent framework, and scalable evaluation metric are all delivered and described in the paper body.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper claims that adding critic feedback and knowledge graphs 'enhances performance,' but comparisons across three configurations are made without statistical significance tests, confidence intervals, or multiple runs, making causal inference unjustified.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper frames contributions broadly as 'accelerating materials discovery and design' but the benchmark is 50 papers from a single month (January 2024) in specific journals; claims of general applicability are not bounded to this narrow scope.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss alternative explanations for why KG+Feedback outperforms other configurations, such as the possibility that the knowledge graph simply retrieves terms that match ground truth vocabulary rather than improving reasoning quality.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper conflates LLM-rated 'Quality' scores and similarity to one specific ground truth solution with the broader claim of generating 'viable hypotheses that expedite materials discovery'; this distinction is not drawn explicitly.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "A dedicated 'Limitation' section is present, discussing dataset size (50 papers) and reliance on LLMs for feedback/critique.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The limitations section names specific concerns: dataset size may not capture diversity; unanimous LLM critic agreement does not guarantee scientific accuracy; hallucination risk from LLM critics.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what the results do not show — for example, there is no statement that the system cannot produce experimentally validated hypotheses or that results may not transfer beyond corrosion/coatings applications.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Funding is disclosed: 'Engineering Research and Development Center - Information Technology Laboratory (ERDC-ITL) under Contract No. W912HZ24C0022.'",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All five authors are affiliated with Arizona State University, clearly stated in the paper header.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "ERDC-ITL is a US Army research center with no financial stake in the LLM providers (OpenAI, Anthropic, Google) evaluated in the study.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) appears in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are defined: 'hypothesis' is operationalized as goal+constraint input producing material+method output; evaluation metrics (Closeness, Quality and their sub-components) are formally defined with rubrics.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Contributions are explicitly enumerated: (1) MATDESIGN benchmark, (2) ACCELMAT agentic framework, (3) scalable evaluation metric.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper compares ACCELMAT against LLMatDesign, ChemReasoner, and SciAgents in Table 1, and contrasts MATDESIGN against MaScQA and ChemLLMBench in Table 2, showing how this work differs.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Code and data are available at a GitHub link provided in the paper's first footnote.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The MATDESIGN dataset (50 entries) is released at the same GitHub repository.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements.txt, Dockerfile, or specific library/version specifications are mentioned anywhere in the paper.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "While detailed prompts are provided in the appendix, there are no step-by-step instructions for running the pipeline, including API setup, execution order, or parameter configuration.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results are reported as single point estimates (e.g., '80% Closeness score'); no confidence intervals or error bars appear anywhere.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied to any comparisons across the three configurations.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "Percentage improvements are stated (e.g., '3.33% improvement'), but without variance measures these are not interpretable as effect sizes.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The 50-paper dataset size is justified on temporal grounds (post-cutoff) rather than statistical power analysis.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No standard deviation, variance, or confidence ranges are reported; results come from a single run on 50 test cases.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "The 'Hypotheses Generation without Feedback' configuration explicitly serves as the baseline for comparing the two enhanced configurations.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "The only baseline is a degraded version of their own system; despite Table 1 listing LLMatDesign, ChemReasoner, and SciAgents, no actual performance comparison against these systems is provided.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "The three configurations (no feedback → feedback → feedback+KG) constitute an incremental ablation studying the effect of each added component.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Both Closeness (Concept Overlap, Property Overlap, Keyword Matching) and Quality (6 criteria: Alignment, Plausibility, Innovation, Testability, Feasibility, Impact) are used.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Four PhD students in Materials Science independently evaluated 42 suggestions across the three configurations using the same evaluation rubric as the automated system.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "The 50-paper MATDESIGN benchmark is held out from LLM training by design (January 2024 papers, beyond all models' knowledge cutoffs).",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Figure 2 provides per-criterion breakdown for all three Closeness metrics and all six Quality criteria across the three configurations.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Failure modes of the baseline configuration are described qualitatively: lack of consensus, incomplete constraint adherence, and material selection bias.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Appendix G reports that BERTScore rankings contradict the LLM-based and human evaluations, with the best configuration (KG+Feedback) scoring lowest on BERTScore (50.30% vs 60.59% for feedback-only).",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "GPT-4o and Claude-3.5-Sonnet are used without specific snapshot dates; model version pinning is absent for the main evaluation models.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Full prompts for all agents (HGA, Critics, Summarizer, Evaluation Agent) are provided in Appendices C and D with actual example inputs.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No temperature, top-p, or other generation hyperparameters are reported for any of the models used.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The multi-agent architecture (HGA → CAs → SA → EA) and iteration logic (up to 5 cycles, early stopping on unanimous consensus) are described in detail including Algorithm 1.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": false,
    262           "justification": "Dataset extraction is described as 'manually extracted' by experts but no extraction guidelines, inter-annotator agreement, or selection criteria for which journals/papers to include are documented.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "The GitHub repository claims to include the MATDESIGN dataset with goals, constraints, and ground truth materials/methods.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Data collection is described: 50 papers from January 2024 in named journals (Nature, Nature Communications, Progress in Organic Coatings), with structured extraction of four field types by materials science experts.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "NA — the dataset uses journal papers, not recruited participants.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": false,
    288           "justification": "No full pipeline from PDF journal papers to structured dataset entries is documented; the extraction process relies on expert judgment without described protocols or tooling.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": true,
    296           "justification": "Training cutoffs are stated in footnote 2: LLaMA-3.1-70B (December 2023), GPT-4o (October 2023).",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "The paper explicitly addresses contamination by constructing the benchmark from January 2024 papers to ensure ground truth lies beyond all models' training cutoffs.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "The temporal separation strategy (post-cutoff papers) directly addresses benchmark contamination; this is a central design motivation for MATDESIGN.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "NA — expert annotators reviewing AI outputs is not a human subjects study requiring pre-registration.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "NA — the human evaluation involves expert domain review, not human subjects research.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "NA — evaluators are described only as 'four PhD students in Materials Science'; no demographics are required for this evaluation context.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "NA — the evaluators are expert annotators, not study participants, so human-subjects inclusion/exclusion criteria do not apply.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "NA — no experimental randomization applies to expert evaluation.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "NA — expert evaluation of AI outputs does not constitute human subjects research.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "NA — no participant attrition applies to this evaluation context.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "The paper mentions restricting to 20 hypotheses 'to restrict the cost per instance' but provides no actual cost figures in dollars, tokens, or API calls.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No total computational budget is stated; the acknowledgment mentions access to 'ChatGPT enterprise version' but gives no cost or compute figures.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "The ACCELMAT configuration with Knowledge Graph and Feedback achieves the best Closeness (80%) and Quality (89%) scores on MATDESIGN.",
    375       "evidence": "Figure 2 reports configuration scores; Closeness: 70% (no FB) → 73.33% (FB) → 80% (FB+KG); Quality: 79.67% → 85.67% → 89%.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "MATDESIGN avoids training data contamination because all papers are from January 2024, beyond all evaluated LLMs' training cutoffs.",
    380       "evidence": "Footnote 2 cites LLaMA-3.1-70B cutoff (Dec 2023) and GPT-4o cutoff (Oct 2023); the dataset design is explicitly motivated by this temporal separation.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Human expert evaluations of the 42 suggestions parallel the automated LLM-based evaluation results.",
    385       "evidence": "Section 7.1 states that human evaluations 'paralleled the automated results' with the same ranking of configurations, but provides no quantitative alignment metrics.",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "Adding critic feedback increases the average number of hypotheses reaching consensus from 11/20 to 18/20, and adding the knowledge graph raises it further to 19/20.",
    390       "evidence": "Table 5 reports these figures for the three configurations.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "BERTScore is inadequate for evaluating hypothesis quality in this domain because it produces rankings inconsistent with LLM-based and human evaluations.",
    395       "evidence": "Appendix G shows KG+Feedback scores lowest on BERTScore (50.30%) but highest on LLM-based and human evaluation, creating a rank reversal.",
    396       "supported": "moderate"
    397     }
    398   ],
    399   "methodology_tags": [
    400     "benchmark-eval",
    401     "case-study"
    402   ],
    403   "key_findings": "ACCELMAT, a multi-LLM agent framework using GPT-4o for hypothesis generation with three LLM critics (GPT-4o, Claude-3.5-Sonnet, Gemini-1.5-Flash) and iterative feedback, outperforms a no-feedback baseline on the 50-paper MATDESIGN benchmark (80% vs 70% Closeness, 89% vs 79.67% Quality when augmented with the MatKG knowledge graph). The MATDESIGN benchmark is specifically constructed from January 2024 papers to avoid training data contamination. Human expert evaluation on 42 suggestions from four PhD students in Materials Science confirms the same performance ordering as the automated LLM-based evaluation metric. BERTScore produces rankings inconsistent with both LLM-based and human evaluations, suggesting traditional embedding-based similarity metrics are inadequate for hypothesis quality assessment in this domain.",
    404   "red_flags": [
    405     {
    406       "flag": "Trivially small benchmark",
    407       "detail": "Only 50 test instances drawn from a single month (January 2024) of publications; percentage differences of 3-10% across configurations are not statistically tested and may not be meaningful at this sample size."
    408     },
    409     {
    410       "flag": "LLM-as-judge circularity",
    411       "detail": "OpenAI o1-preview evaluates hypotheses generated primarily by GPT-4o; the generator and the judge come from the same provider, introducing potential systematic bias in quality ratings."
    412     },
    413     {
    414       "flag": "No external system comparison",
    415       "detail": "Table 1 compares ACCELMAT feature-by-feature with LLMatDesign, ChemReasoner, and SciAgents, but provides no actual performance comparison against these systems on any common benchmark."
    416     },
    417     {
    418       "flag": "BERTScore rank reversal unexplained",
    419       "detail": "The best-performing configuration (KG+Feedback) scores lowest on BERTScore (50.30% vs 60.59% for feedback-only), which is dismissed as a metric limitation rather than investigated as a potential validity concern for the LLM-based metric."
    420     },
    421     {
    422       "flag": "No variance or multi-run evaluation",
    423       "detail": "All results are single-run point estimates; LLM temperature and stochasticity mean results could vary substantially across runs, but no variance is measured or reported."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "LLMatDesign: Autonomous materials discovery with large language models",
    429       "relevance": "Direct predecessor work on LLM-based autonomous materials design; compared in Table 1."
    430     },
    431     {
    432       "title": "ChemReasoner: Heuristic search over a large language model's knowledge space using quantum-chemical feedback",
    433       "relevance": "Related LLM agent framework for chemistry hypothesis generation; compared in Table 1."
    434     },
    435     {
    436       "title": "SciAgents: Automating scientific discovery through multi-agent intelligent graph reasoning",
    437       "relevance": "Related multi-agent framework for scientific discovery; compared in Table 1."
    438     },
    439     {
    440       "title": "MaScQA: A question answering dataset for investigating materials science knowledge of large language models",
    441       "relevance": "Existing materials science benchmark compared against MATDESIGN in Table 2."
    442     },
    443     {
    444       "title": "What can large language models do in chemistry? A comprehensive benchmark on eight tasks",
    445       "relevance": "Chemistry benchmark (ChemLLMBench) compared against MATDESIGN in Table 2."
    446     },
    447     {
    448       "title": "MatKG: An autonomously generated knowledge graph in material science",
    449       "relevance": "The external knowledge graph integrated into the best-performing ACCELMAT configuration.",
    450       "source": "haiku"
    451     },
    452     {
    453       "title": "Scaling deep learning for materials discovery",
    454       "relevance": "Foundational ML-for-materials paper (Nature, Merchant et al. 2023) providing context for the field's progress.",
    455       "source": "haiku"
    456     },
    457     {
    458       "title": "BERTScore: Evaluating text generation with BERT",
    459       "relevance": "Traditional evaluation metric tested as alternative to LLM-based evaluation in Appendix G.",
    460       "source": "haiku"
    461     }
    462   ],
    463   "engagement_factors": {
    464     "practical_relevance": {
    465       "score": 2,
    466       "justification": "Materials scientists could use the ACCELMAT pipeline as a brainstorming tool, and the code/data are released, but the 50-paper benchmark and LLM-only validation limit immediate practical deployment."
    467     },
    468     "surprise_contrarian": {
    469       "score": 1,
    470       "justification": "The BERTScore rank reversal is mildly surprising, but the overall finding that iterative LLM feedback improves outputs is expected."
    471     },
    472     "fear_safety": {
    473       "score": 0,
    474       "justification": "No AI safety or risk concerns are raised; this is a constructive application paper."
    475     },
    476     "drama_conflict": {
    477       "score": 0,
    478       "justification": "No controversy or competing claims with other groups."
    479     },
    480     "demo_ability": {
    481       "score": 2,
    482       "justification": "Code and dataset are on GitHub, and the prompts are fully documented, so a practitioner with API access could reproduce the system."
    483     },
    484     "brand_recognition": {
    485       "score": 1,
    486       "justification": "Arizona State University is a mid-tier research institution for NLP; NAACL is a respected venue but not top-tier for this domain."
    487     }
    488   },
    489   "hn_data": {
    490     "threads": [],
    491     "top_points": 0,
    492     "total_points": 0,
    493     "total_comments": 0
    494   }
    495 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs