scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27499B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "A fine-tuned large language model based molecular dynamics agent for code generation to obtain material thermodynamic parameters",
      6     "authors": [
      7       "Zhuo-Fan Shi",
      8       "Chunxiao Xin",
      9       "Tong Huo",
     10       "Yun-Tao Jiang",
     11       "Bowen Wu"
     12     ],
     13     "year": 2025,
     14     "venue": "Scientific Reports",
     15     "arxiv_id": null,
     16     "doi": "10.1038/s41598-025-92337-6"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims about improved code generation capabilities and 42.22% time reduction are supported by results in Figures 4a-c showing time savings and expert satisfaction with MDAgent.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Paper claims MDAgent 'reduces task time' causally, but uses within-subjects design with unspecified number of experts, no mention of randomization order, no statistical significance testing, and vague baseline description.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Title promises 'material thermodynamic parameters' generality, but evaluation is limited to 4 specific LAMMPS tasks. Paper claims scalability to VASP and other software (Future Work) but provides no evidence of generalization beyond LAMMPS.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Paper does not discuss alternative explanations for time savings (e.g., interface design, expert familiarity with tool, task complexity selection bias) or competing agent designs.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Measured outcomes (task time, expert ratings, code quality scores) align with claims about efficiency and code generation capability; no conflation between proxy and target measures.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No dedicated Limitations section. Constraints are mentioned in Discussion (semi-automated nature, small-parameter LLMs), but there is no systematic, detailed limitations statement.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Discussion mentions MDAgent is 'semi-automated' but does not discuss specific threats: expert selection bias, unspecified sample size, limited task diversity (4 tasks), or generalization risks beyond LAMMPS.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Scope boundaries not explicitly stated. Paper does not say 'we do NOT show generalization to [other software]' or 'we do NOT evaluate [other domains]'—claims are vague about boundaries.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgments disclose: 'supported by National Key Laboratory of Data Space Technology and System' and prior grant mentioned in text.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All five authors list institutional affiliations (Peking University, Chinese Academy of Sciences, etc.) with footnotes 1-4.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Funder is National Key Lab (government/neutral institution), not the product vendor or company with financial stake in MDAgent adoption.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Declarations section states: 'The authors declare no competing interests.' Direct financial interest statement present.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "Key terms used without precise definitions: 'agent' (context-dependent), 'fine-tuning' (technical term assumed known), 'thermodynamic parameters' (assumed domain knowledge). 'MDAgent' is described architecturally but not formally defined upfront.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Main contributions explicitly stated in Introduction: (1) MDAgent framework for text-to-code generation, (2) LSCF-Dataset for fine-tuning, (3) LEQS-Dataset for evaluation. Contribution as tool + datasets is clear.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "Related work (ChemLLM, MatterGen, ChemCrow, HoneyComb, ChatMOF) is listed in Introduction, but engagement is superficial: brief descriptions with no detailed comparison of how MDAgent differs from or builds on prior work (e.g., HoneyComb also targets materials science agents but no comparison).",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Code and datasets explicitly stated as 'publicly available at https://github.com/FredericVAN/PKU_MDAgent' per Data Availability statement. GitHub release confirmed.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Both LSCF-Dataset and LEQS-Dataset are stated as publicly available via the same GitHub repository. Datasets are released.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Methods mention 'QLoRA' and 'Unsloth framework' but provide no requirements.txt, Dockerfile, Python version, GPU specs, or dependency list. Environment setup is not reproducible from the paper.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Paper provides system architecture and dataset descriptions but no step-by-step instructions to install, fine-tune, or run MDAgent from scratch. GitHub repo is referenced but paper contains no instructions.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Figures 4a-f show bars/points for task time, expert ratings, and evaluation scores, but no error bars, confidence intervals, or variance bands visible. Variance completely absent.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "Comparisons between MDAgent vs. manual, fine-tuned vs. non-fine-tuned models are made, but no t-tests, p-values, or statistical significance tests reported.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "42.22% time reduction is reported, but for other comparisons (code quality, evaluation accuracy) no effect sizes (e.g., Cohen's d, percentage improvement over baseline) are given—only point estimates are shown without context.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Number of expert participants is never specified ('multiple experts'). Dataset sizes (167 LSCF scripts, LEQS quadruples) are stated but not justified via power analysis or prior work.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No standard deviations, variances, or ranges reported for task times, expert ratings, or evaluation scores. Results presented as point estimates only (Figures 4a-f show means/medians without spread).",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Paper compares MDAgent vs. 'traditional manual methods based on human expertise' and fine-tuned models vs. general models (ChatGPT, Qwen, ChatGLM). Baselines are present.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines (human expert manual work, general LLMs like ChatGPT/Qwen) are contemporary and relevant as of 2025. No suspiciously outdated models compared.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "No ablation study. Paper does not test MDAgent without Manager, Planner, Evaluator, or fine-tuning separately. Cannot determine which components drive time savings.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Multiple metrics used: task completion time (Figure 4a), expert satisfaction (4b), code quality scores (4c), evaluator accuracy (MAE/MSE in 4e-f). Four separate evaluation dimensions.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Expert materials scientists evaluated LAMMPS script outputs for correctness, rated task completion usability, and scored evaluator predictions. Human evaluation of system outputs present.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Methods state: 'A random subset of the LEQS-Dataset will be used for fine-tuning... with a separate random subset designated for testing to ensure no overlap.' Train/test split enforced.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Four thermodynamic tasks (heat capacity, lattice constant, melting point, thermal expansion) are evaluated separately in Figure 3. Results broken down by task type.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "Paper notes evaluator 'is not yet ideal in terms of performance metrics' but provides no specific failure cases, error examples, or analysis of where MDAgent breaks down.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": false,
    230           "justification": "Paper acknowledges evaluator limitations ('not yet ideal') and semi-automated nature, but does not report comprehensive negative findings (e.g., tasks where MDAgent failed, low-accuracy clusters).",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Paper mentions 'ChatGPT, Qwen, ChatGLM' and 'open-source large models' as baselines/fine-tuning bases, but no model versions, sizes, snapshot dates, or exact checkpoint identifiers provided.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "No actual prompts or system instructions shown. Paper describes agent components architecturally (Manager, Planner, Worker) but does not include the text prompts used to instruct the models.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "Methods mention 'QLoRA' and 'Unsloth' for fine-tuning but report no learning rate, batch size, epochs, temperature, top-p, or other hyperparameters used in training.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Agent scaffolding is detailed in Methods: Manager (task coordination), Planner (task decomposition), Workers (code generation), Evaluators (feedback loop), memory module, UI. Architecture well-described.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "LSCF-Dataset preprocessing documented: 'screened code, removing erroneous code... annotated every script and divided into three main parts [initialization, modeling, computation]... converted to Alpaca format.' LEQS dataset construction via multi-stage expert rubric also documented.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "GitHub repository contains code and datasets. Data availability statement confirms 'publicly available at https://github.com/FredericVAN/PKU_MDAgent'. Raw data (scripts, annotations) accessible.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "LSCF-Dataset collection: 'gathered case code from official documentation, published papers, and open-source projects' (1:2:2 ratio). LEQS-Dataset: 'senior materials scientists designed tasks' and experts scored outputs. Collection methods are described.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": false,
    282           "justification": "User study mentions 'multiple experts in materials science' but omits: number of experts, recruitment strategy, selection criteria, compensation. Expert identity and recruitment process are opaque.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "LSCF pipeline: collection → screening → annotation → Alpaca conversion → fine-tuning. LEQS pipeline: task design → LLM generation → expert scoring → fine-tuning/testing split. Full pipelines described.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "The paper does not evaluate pre-trained models on pre-existing benchmarks; the custom datasets (LSCF, LEQS) are author-constructed, so a training cutoff is irrelevant. N/A.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "Methods explicitly state: 'A random subset of the LEQS-Dataset will be used for fine-tuning... with a separate random subset designated for testing to ensure no overlap.' Train/test separation enforced.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "Using custom author-created datasets, not public benchmarks. Benchmark contamination question is N/A for this setup.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "No pre-registration of the expert user study is mentioned. The study appears to have been conducted without prior registration.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": true,
    321           "answer": false,
    322           "justification": "No mention of IRB approval, ethics review, or institutional review board clearance despite involving expert human participants in task evaluations.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": true,
    327           "answer": false,
    328           "justification": "Experts described only as 'materials science experts.' No demographics: age, gender, experience level, institution, prior familiarity with LLMs, or other participant characteristics reported.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": true,
    333           "answer": false,
    334           "justification": "No explicit inclusion/exclusion criteria stated. What qualifies as a 'materials science expert'? Minimum experience required? These criteria are absent.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": true,
    339           "answer": false,
    340           "justification": "No randomization of expert task order or baseline presentation order described. Data split randomization mentioned ('random subset') but not task assignment randomization.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": true,
    345           "answer": false,
    346           "justification": "No mention of blinding. Experts likely knew they were comparing MDAgent vs. manual methods, introducing potential bias. No single-blind or double-blind design described.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": true,
    351           "answer": false,
    352           "justification": "No mention of dropout, attrition, or incomplete evaluations. Unknown if any participants withdrew or failed to complete the study.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "Discussion mentions 'limitations related to... operational costs' but provides no quantitative cost data: $ per inference, latency, token count, or compute hours.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No total computational budget reported for fine-tuning, evaluation, or running the full system. GPU hours, cloud costs, or training budget not disclosed.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "MDAgent reduces average task completion time by 42.22% compared to traditional manual methods",
    375       "evidence": "Figure 4a shows task elapsed time comparison between MDAgent and manual baseline",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Fine-tuned models significantly outperform non-fine-tuned large models on LAMMPS code generation",
    380       "evidence": "Figure 4c shows evaluation scores for fine-tuned LAMMPSLLM vs. other models; fine-tuned version higher",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Fine-tuning reduces the evaluator's mean absolute error and mean squared error, improving scoring accuracy",
    385       "evidence": "Figures 4e-f show MAE/MSE values decrease for fine-tuned LammpsEvaluator vs. baseline",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "MDAgent effectively assists entry-level materials scientists in completing thermodynamic simulation tasks",
    390       "evidence": "Figure 4b shows expert satisfaction ratings; Discussion notes 'excellent capabilities in script code generation'",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "The LSCF and LEQS datasets address the scarcity of domain-specific LAMMPS training data",
    395       "evidence": "Paper introduces two custom datasets (167 LSCF scripts, LEQS quadruples) but does not quantify the extent of public data scarcity",
    396       "supported": "weak"
    397     },
    398     {
    399       "claim": "MDAgent can be extended to other computational materials science tasks (e.g., VASP)",
    400       "evidence": "Discussion states 'extending MDAgent methodology to first-principles calculation tasks' as future work; claimed but not demonstrated",
    401       "supported": "unsupported"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "empirical",
    406     "case-study",
    407     "benchmark-eval"
    408   ],
    409   "key_findings": "The paper introduces MDAgent, an LLM-based agent framework for automating LAMMPS code generation in materials science, reducing task completion time by 42.22% relative to manual methods. Two custom datasets (LSCF-Dataset for fine-tuning, LEQS-Dataset for evaluation) were created to address scarcity of domain-specific training data. Expert evaluation confirms that fine-tuned models outperform general large language models on LAMMPS script generation tasks, though the evaluator component exhibits only modest agreement with human expert scores. The system is presented as semi-automated, requiring human oversight due to current LLM limitations.",
    410   "red_flags": [
    411     {
    412       "flag": "No statistical significance testing",
    413       "detail": "Time comparisons and evaluation metrics lack p-values, confidence intervals, or significance tests. Cannot determine if 42.22% improvement is statistically robust or within noise."
    414     },
    415     {
    416       "flag": "Unspecified expert sample size and recruitment",
    417       "detail": "User study references 'multiple experts' without stating exact number, recruitment method, selection criteria, or demographics. Small, non-representative sample likely."
    418     },
    419     {
    420       "flag": "Missing IRB/ethics approval",
    421       "detail": "Human subject study with expert evaluators lacks mention of institutional review board approval or ethical clearance, despite involving human participants."
    422     },
    423     {
    424       "flag": "Very limited evaluation scope",
    425       "detail": "Only 4 thermodynamic tasks tested (heat capacity, lattice constant, melting point, expansion coefficient). Generalization to broader LAMMPS applications unvalidated."
    426     },
    427     {
    428       "flag": "No ablation study",
    429       "detail": "Cannot isolate contributions of Manager, Planner, Worker, Evaluator, or fine-tuning. System is evaluated as black box; component importance unknown."
    430     },
    431     {
    432       "flag": "Evaluator accuracy concerns",
    433       "detail": "Figure 4d shows LammpsEvaluator frequently disagrees with human expert scores. Fine-tuning reduces error but agreement is incomplete. Evaluator cannot reliably replace human judgment."
    434     },
    435     {
    436       "flag": "Overgeneralized title and abstract",
    437       "detail": "Title promises 'material thermodynamic parameters' in general, but only 4 LAMMPS tasks are tested. Claims of scalability to other software (VASP) are future work, not demonstrated."
    438     },
    439     {
    440       "flag": "Incomplete reproducibility documentation",
    441       "detail": "While GitHub repo is available, paper lacks step-by-step instructions, environment specifications (requirements.txt, GPU specs, Python version), or hyperparameter details for reproduction."
    442     },
    443     {
    444       "flag": "No alternative baseline comparisons",
    445       "detail": "Compared only against general LLMs (ChatGPT, Qwen) and manual methods. No comparison with other specialized code-generation systems (e.g., Copilot, CodeLLaMA fine-tuned variants) or domain-specific agents (e.g., competing materials science AI systems)."
    446     },
    447     {
    448       "flag": "Vague baseline description",
    449       "detail": "'Traditional manual methods based on human expertise' is undefined. What exactly is the manual baseline? Is it expert-optimal code? Novice code? No control condition clarity."
    450     }
    451   ],
    452   "cited_papers": [
    453     {
    454       "title": "Understanding Molecular Simulation: From Algorithms to Applications",
    455       "authors": "Frenkel, D. & Smit, B.",
    456       "year": 2023,
    457       "relevance": "Foundational molecular dynamics theory and algorithms underlying LAMMPS simulations."
    458     },
    459     {
    460       "title": "ChemLLM: A Chemical Large Language Model",
    461       "authors": "Zhang, D. et al.",
    462       "year": 2024,
    463       "relevance": "Related work on domain-specific fine-tuning of LLMs for chemistry; similar methodology for specialized domains."
    464     },
    465     {
    466       "title": "Unleashing the power of AI in science—key considerations for materials data preparation",
    467       "authors": "Lu, Y. et al.",
    468       "year": 2024,
    469       "relevance": "Discusses data quality and preparation challenges for AI in materials science, directly motivates dataset creation (LSCF, LEQS)."
    470     },
    471     {
    472       "title": "A survey on large language model based autonomous agents",
    473       "authors": "Wang, L. et al.",
    474       "year": 2024,
    475       "relevance": "Comprehensive survey of LLM-based agent systems; contextualizes MDAgent within broader agent design patterns."
    476     },
    477     {
    478       "title": "HoneyComb: A Flexible LLM-based Agent System for Materials Science",
    479       "authors": "Zhang, H. et al.",
    480       "year": 2024,
    481       "relevance": "Directly competing work on LLM agents for materials science; no direct comparison or differentiation in paper."
    482     },
    483     {
    484       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    485       "authors": "Shinn, N. et al.",
    486       "year": 2023,
    487       "relevance": "Incorporates reflexion principles into MDAgent evaluator feedback loop for iterative code refinement."
    488     }
    489   ],
    490   "engagement_factors": {
    491     "practical_relevance": {
    492       "score": 2,
    493       "justification": "Tool exists and is published, but applicability is narrow (LAMMPS-specific thermodynamic tasks). Unclear if materials scientists will adopt without vendor support or integration."
    494     },
    495     "surprise_contrarian": {
    496       "score": 1,
    497       "justification": "Applying LLMs to code generation in materials science is incremental; many similar agent systems exist (HoneyComb, ChemCrow). No surprising methodological or domain insight."
    498     },
    499     "fear_safety": {
    500       "score": 0,
    501       "justification": "No AI safety, alignment, or risk concerns raised. System is benign domain application with no safety implications."
    502     },
    503     "drama_conflict": {
    504       "score": 0,
    505       "justification": "No controversy, conflict, or dramatic angle. Straightforward systems paper with positive results."
    506     },
    507     "demo_ability": {
    508       "score": 2,
    509       "justification": "Code/datasets on GitHub, but environment setup unclear (no requirements.txt). Would-be users need materials science domain knowledge to evaluate; barrier to casual exploration."
    510     },
    511     "brand_recognition": {
    512       "score": 1,
    513       "justification": "Published in Scientific Reports (reputable, but not Nature/Science). Peking University affiliation is recognized but authors are not widely known in AI/ML communities."
    514     }
    515   },
    516   "hn_data": {
    517     "threads": [],
    518     "top_points": 0,
    519     "total_points": 0,
    520     "total_comments": 0
    521   }
    522 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs