scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24519B)
      1 {
      2   "paper": {
      3     "title": "Automated Extraction of Mechanical Constitutive Models from Scientific Literature using Large Language Models: Applications in Cultural Heritage Conservation",
      4     "authors": ["Rui Hu", "Yue Wu", "Tianhao Su", "Yin Wang", "Shunbo Hu", "Jizhong Huang"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.16551"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper. The web platform is described but no link or source code is provided."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The Heritage Materials Constitutive Database is described but no download link, archive, or public URL is provided. The web platform screenshots are shown but no access information is given."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications, library versions, requirements files, or dependency lists are provided. The paper does not mention which specific LLM models were used (only referring generically to 'resource-efficient' and 'high-capability' models)."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided. The methodology section describes the framework conceptually but does not provide runnable instructions or scripts."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Only point estimates are reported: Precision 80.4%, Recall 83.3%, F1 81.9%. No confidence intervals or error bars are provided for any metric."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "The paper does not make comparative claims against alternative systems. It reports only its own system's performance metrics without comparing to baselines, so significance tests are not applicable."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims '90% reduction in manual curation time' but provides no baseline measurement or methodology for this estimate. The claim appears in the abstract and discussion but lacks supporting data."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The corpus of 2,000+ papers is described but no justification is given for why this sample size is adequate. The ground truth evaluation is on 113 papers (all papers passing the Gatekeeper), but no justification is given for this size."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Single-run results are reported. No variance across runs, seeds, or repeated evaluations is mentioned. Given LLM non-determinism (acknowledged in Section 2.4.4), variance across runs would be informative but is absent."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No baselines are included. The paper mentions traditional NLP pipelines and rule-based systems in the introduction as being inadequate but does not compare its system against any of them quantitatively."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No baselines are included at all, so this criterion cannot be satisfied. The paper references traditional approaches (regex, NER, rule-based) only in prose without any quantitative comparison."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The system has multiple components (Gatekeeper agent, Analyst agent, Context-Aware Symbolic Grounding, Schema-Constrained Decoding, Self-Correction) but no ablation study is conducted to measure which components contribute to performance."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper reports Precision (80.4%), Recall (83.3%), F1-Score (81.9%), and AUC (0.782), along with a full confusion matrix including TP, TN, FP, FN counts."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Domain experts in solid mechanics manually annotated the entire 113-paper set to create a ground truth dataset of 222 target constitutive model entities (Section 3.2). System outputs were evaluated against this expert ground truth."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "There is no separation of development and test sets. The ground truth was created from the same 113 papers the system processed. No held-out set was used; all evaluation is on the same data used for system development."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Figure 2 provides a distribution of extracted constitutive models by mechanical mechanism category (Elasto-Plasticity 31.9%, Failure & Damage 24.8%, Time-dependent/Rheology 12.4%, etc.). However, per-category precision/recall breakdowns are not provided."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 3.2 discusses that residual errors are 'largely attributable to the parsing challenges of non-standard tabular structures in older PDFs.' Section 5 discusses limitations with older low-resolution scanned documents and challenges with chart-to-data conversion."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No negative results are reported. The paper does not mention any approaches that were tried and failed, configurations that did not work, or design decisions that led to worse performance."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims are supported: 113 core documents from 2,000+ papers (Section 3.1), 185 model instances and 450+ parameters (Section 3.1), 80.4% precision (Section 3.2), and ~90% workload reduction (Section 5, though this last claim is weakly supported)."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper claims the framework 'reduces manual data curation time by approximately 90%' (abstract) without any controlled comparison or measurement methodology. This is a causal claim ('reduces') with no supporting study design. The paper also claims the Symbolic Grounding mechanism resolves ambiguities but provides only a single qualitative example."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title claims 'Applications in Cultural Heritage Conservation' broadly, and the conclusion states this 'lays the foundation for the Digital Material Twin of our built heritage.' However, the corpus was sourced exclusively from arXiv (Section 2.2.1), which is not a primary venue for heritage conservation research. This significant source limitation is not acknowledged as bounding the generalizability of results."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations for the results are discussed. For example, the 80.4% precision could be partially explained by the restrictive Gatekeeper filtering creating an easier extraction task on pre-selected relevant papers, but this is not considered."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "No specific LLM model names or versions are stated anywhere in the paper. The Gatekeeper is described as a 'resource-efficient Large Language Model' and the Analyst as having 'advanced reasoning capabilities,' but no model names (e.g., GPT-4, Claude, etc.) or version identifiers are provided."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper describes what the prompts do in natural language (e.g., 'evaluating each document against three non-negotiable dimensions') but does not provide any actual prompt text. No appendix with prompts is included."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No hyperparameters are reported — no temperature, top-p, max tokens, or any LLM API settings are mentioned. The Head-Truncation Strategy mentions '8,000 characters' but no model inference parameters are given."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The two-stage agentic scaffolding is described in detail: Gatekeeper for relevance filtering (Section 2.3), Analyst for extraction (Section 2.4), including Context-Aware Target Identification (2.4.1), Symbolic Grounding (2.4.2), Schema-Constrained Decoding (2.4.3), and Error Handling with Self-Correction (2.4.4). The pipeline workflow is documented."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 2.2 describes the data acquisition process: sourcing from arXiv with boolean search queries combining heritage and mechanics keywords, PDF parsing with a custom pipeline, and a Head-Truncation Strategy extracting ~8,000 characters for Stage I processing."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "The Discussion section (Section 5) contains substantive discussion of limitations, including sensitivity to older low-resolution scanned documents and challenges with multimodal extraction from complex scientific charts."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "The limitations discussed in Section 5 are specific to the system's technical capabilities (document quality, chart extraction) but do not address threats to the validity of the evaluation itself — e.g., the lack of inter-annotator agreement for the ground truth, the single-source (arXiv) corpus bias, or the absence of held-out evaluation."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. It does not acknowledge that results are limited to arXiv papers only (not the broader heritage conservation literature), nor that the evaluation is on a single corpus without cross-validation. The discussion focuses on future improvements rather than explicit scope boundaries."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "None of the raw corpus of 2,000+ papers, the 113 filtered papers, the expert annotations (ground truth), or the extracted database is made available for independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 2.2.1 describes the data collection: papers sourced from arXiv, targeting physics and engineering categories, using boolean search combining heritage keywords with mechanics terms. The filtering pipeline (2,000+ → 113) is described."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants were recruited for a study. The domain experts who created the ground truth annotations are not described as study participants — they appear to be the research team itself. This is a benchmark evaluation paper, not a human subjects study."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline is documented: 2,000+ papers ingested → PDF parsing and serialization (Section 2.2.2) → Gatekeeper filtering to 113 papers (Section 2.3) → Analyst extraction yielding 185 model instances and 450+ parameters (Section 3.1). Filtering counts at each stage are provided."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Acknowledgments section states: 'This work is supported by the Advanced Materials-National Science and Technology Major Project(2025ZD0618802) and the Shanghai Technical Service Center of Science and Engineering Computing, Shanghai University.'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors are listed with their affiliation: Shanghai University, Shanghai, China. The corresponding author email is provided."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The funder is a national science and technology project and a university computing center. Neither has a commercial stake in the outcomes of this LLM extraction system."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement or financial interests declaration is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. It uses LLMs as extraction tools on novel domain-specific documents, not as test subjects on a known benchmark. Contamination of extraction targets in training data is not the same concern."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Same reasoning: this is an information extraction pipeline, not a benchmark evaluation of model knowledge. The LLMs are used as tools, not evaluated on memorized knowledge."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Same reasoning: no benchmark evaluation of pre-trained model capability is conducted."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants were involved in a study. This is a system/benchmark evaluation paper, not a human subjects study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants were involved. IRB approval is not applicable."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants were involved in a study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants were involved in a study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants or experimental conditions involving randomization."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants or experimental conditions involving blinding."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants were involved in a study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The paper emphasizes computational efficiency as a design goal (the two-stage architecture is motivated by cost reduction) but never reports actual inference costs, API spend, tokens consumed, or wall-clock time for processing the 2,000+ paper corpus."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget is stated. No GPU hours, API costs, hardware specifications, or processing time are reported despite the system processing 2,000+ papers."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The system achieved extraction precision of 80.4%, recall of 83.3%, and F1-score of 81.9% on the 113-paper ground truth corpus.",
    286       "evidence": "Section 3.2 reports these metrics based on manual expert annotation of 222 ground truth constitutive model entities. Confusion matrix in Figure 3a shows TP=185, FP=45, FN=37, TN=1311.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "The framework successfully extracted 185 constitutive model instances and over 450 calibrated parameters from 113 core documents filtered from 2,000+ papers.",
    291       "evidence": "Section 3.1 reports these numbers. Figure 2 shows the distribution of extracted models by mechanism category.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "The system reduces manual data curation time by approximately 90%.",
    296       "evidence": "Stated in the abstract and discussed in Section 5. No measurement methodology, timing data, or controlled comparison is provided to support this specific figure.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "The Context-Aware Symbolic Grounding mechanism correctly resolves mathematical symbol ambiguities.",
    301       "evidence": "Section 3.3 provides a single qualitative case study (kaolinite clay rheology) showing the system correctly resolved an ambiguous table header scaling factor. No systematic quantitative evaluation of symbolic grounding accuracy is provided.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "The ROC curve yields an AUC of 0.782 with a low False Positive Rate of 3.3%.",
    306       "evidence": "Figure 3b shows the ROC curve. However, the text mentions AUC of 0.782 while the figure caption says 0.762, creating an inconsistency.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval", "case-study"],
    311   "key_findings": "The paper presents a two-stage LLM agentic framework for extracting mechanical constitutive models from scientific literature about cultural heritage materials. From a corpus of 2,000+ arXiv papers, the Gatekeeper agent filtered to 113 relevant documents, from which the Analyst agent extracted 185 constitutive model instances with 80.4% precision and 83.3% recall. A qualitative case study demonstrates the system's ability to resolve symbol ambiguities using domain context. The system is deployed as a web-based knowledge retrieval platform for heritage conservation researchers.",
    312   "red_flags": [
    313     {
    314       "flag": "No model identities disclosed",
    315       "detail": "The paper never names which LLMs are used for the Gatekeeper or Analyst agents. This is a fundamental reproducibility gap — the results cannot be replicated without knowing what models were used."
    316     },
    317     {
    318       "flag": "No baselines",
    319       "detail": "No comparison against any baseline system (rule-based, NER-based, or alternative LLM approaches). The paper criticizes traditional approaches in the introduction but never quantitatively compares against them."
    320     },
    321     {
    322       "flag": "AUC inconsistency",
    323       "detail": "The text in Section 3.2 states AUC of 0.782 while Figure 3b caption states AUC of 0.762. This discrepancy is unexplained."
    324     },
    325     {
    326       "flag": "Unsupported 90% workload reduction claim",
    327       "detail": "The claim that the system reduces manual curation time by ~90% appears in the abstract but is never measured or justified with any timing data or user study."
    328     },
    329     {
    330       "flag": "Single-source corpus bias",
    331       "detail": "All papers are sourced exclusively from arXiv, which is not a primary venue for heritage conservation or materials science research. This creates a significant corpus bias that is never acknowledged."
    332     },
    333     {
    334       "flag": "No variance across LLM runs",
    335       "detail": "The paper acknowledges LLM non-determinism (Section 2.4.4, self-correction mechanism) but reports only single-run results without any variance or stability analysis."
    336     },
    337     {
    338       "flag": "No prompts or hyperparameters disclosed",
    339       "detail": "Neither the actual prompts used for the Gatekeeper or Analyst agents nor any LLM API parameters (temperature, etc.) are provided. Combined with the missing model identities, the methodology is essentially a black box."
    340     }
    341   ],
    342   "cited_papers": [
    343     {
    344       "title": "14 examples of how LLMs can transform materials science and chemistry: a reflection on a large language model hackathon",
    345       "authors": ["K. M. Jablonka"],
    346       "year": 2023,
    347       "relevance": "Explores LLM applications in materials science, directly relevant to understanding LLM capabilities for scientific domain extraction."
    348     },
    349     {
    350       "title": "Scientific discovery in the age of artificial intelligence",
    351       "authors": ["H. Wang"],
    352       "year": 2023,
    353       "relevance": "Broad survey of AI capabilities for scientific discovery, relevant to understanding LLM applications in research automation."
    354     },
    355     {
    356       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    357       "authors": ["J. Wei"],
    358       "year": 2022,
    359       "relevance": "Foundational work on LLM reasoning capabilities that underlies the prompting approach used in this extraction framework."
    360     },
    361     {
    362       "title": "Language models are few-shot learners",
    363       "authors": ["T. Brown"],
    364       "year": 2020,
    365       "relevance": "Foundational GPT-3 paper establishing few-shot learning capabilities used in LLM-based extraction tasks."
    366     },
    367     {
    368       "title": "Large language models are zero-shot reasoners",
    369       "authors": ["T. Kojima"],
    370       "year": 2022,
    371       "relevance": "Establishes zero-shot reasoning capability of LLMs, relevant to the extraction framework's use of LLMs without task-specific training."
    372     },
    373     {
    374       "title": "Structured information extraction from complex scientific text with fine-tuned large language models",
    375       "authors": ["A. Dunn"],
    376       "year": 2022,
    377       "arxiv_id": "2212.05238",
    378       "relevance": "Directly relevant prior work on using LLMs for structured extraction from scientific text."
    379     },
    380     {
    381       "title": "Survey of hallucination in natural language generation",
    382       "authors": ["Z. Ji"],
    383       "year": 2023,
    384       "relevance": "Addresses LLM hallucination risks that are central to the reliability concerns of automated extraction systems."
    385     },
    386     {
    387       "title": "The rise and potential of large language model based agents: A survey",
    388       "authors": ["Z. Xi"],
    389       "year": 2025,
    390       "relevance": "Survey of LLM-based agents relevant to understanding the agentic framework design used in this paper."
    391     },
    392     {
    393       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    394       "authors": ["L. Chen"],
    395       "year": 2023,
    396       "relevance": "Addresses cost-efficiency in LLM usage, directly relevant to the paper's two-stage cost-optimization strategy."
    397     },
    398     {
    399       "title": "Efficient guided generation for large language models",
    400       "authors": ["B. T. Willard", "R. Louf"],
    401       "year": 2023,
    402       "arxiv_id": "2307.09702",
    403       "relevance": "Foundation for the schema-constrained decoding approach used in the Analyst agent."
    404     },
    405     {
    406       "title": "Nougat: Neural optical understanding for academic documents",
    407       "authors": ["L. Blecher"],
    408       "year": 2023,
    409       "arxiv_id": "2308.13418",
    410       "relevance": "PDF document parsing tool relevant to the paper's document ingestion pipeline."
    411     }
    412   ]
    413 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs