scan.json (29366B)
1 { 2 "paper": { 3 "title": "Efficient Knowledge Infusion via KG-LLM Alignment", 4 "authors": [ 5 "Zhouyu Jiang", 6 "Ling Zhong", 7 "Mengshu Sun", 8 "Jun Xu", 9 "Rui Sun", 10 "Hui Cai", 11 "Shuhan Luo", 12 "Zhiqiang Zhang" 13 ], 14 "year": 2024, 15 "venue": "Annual Meeting of the Association for Computational Linguistics", 16 "arxiv_id": "2406.03746", 17 "doi": "10.48550/arXiv.2406.03746" 18 }, 19 "scan_version": 3, 20 "active_modules": ["experimental_rigor", "data_leakage"], 21 "methodology_tags": ["benchmark-eval"], 22 "key_findings": "The ELPF framework uses domain-specific KG construction and a three-stage KG-LLM alignment process (K-LoRA pre-learning, SFT with KG retrieval, AKGF) to improve biomedical QA generation. On CMedQA and BioASQ with 500 training samples, ELPF achieves 1.03 and 1.12 ROUGE-L improvements over vanilla SFT respectively. Ablation shows K-LoRA pre-learning is the most impactful component, while AKGF primarily improves knowledge correctness and reduces hallucinations per human evaluation. Domain KGs constructed with only ~100 annotated samples achieve 85-89% extraction precision.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": false, 28 "justification": "No repository URL, code archive, or link to released code is provided anywhere in the paper." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The experiments use two publicly available datasets: CMedQA (Cui and Han, 2020) and BioASQ (Nentidis et al., 2022). The constructed domain KGs and specific train/test splits are not released, but the base datasets are public." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "Appendix D mentions 'four A100 80GB GPUs and two V100 32GB GPUs' and HuggingFace model URLs, but no requirements.txt, Dockerfile, or detailed dependency/library version specifications are provided." 
39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The methodology is described algorithmically but there are no runnable instructions." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "Tables 1, 2, and 3 report only point estimates (e.g., '15.44 ROUGE-L') with no confidence intervals, error bars, or ± notation." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper claims ELPF 'outperforms existing baselines' and shows 'significant performance improvement' based solely on comparing raw numbers without any statistical significance tests (no p-values, t-tests, or bootstrap tests)." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "The paper reports absolute improvements with baseline context: '1.03 ROUGE-L improvement and a 1.03 BLEU improvement compared to the vanilla LoRA-based SFT method' (Section 4.5). Table 1 provides all baseline values for comparison." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "The paper uses 500 training and 1000 test instances per dataset, described as simulating 'a scenario with limited samples,' but provides no justification for why 500 specifically was chosen and no power analysis." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "No variance, standard deviation, or results across multiple runs are reported. All results appear to be single-run numbers." 
71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "Table 1 compares against multiple baselines: ChatGPT-3.5 (0-shot and 2-shot), LLM-base, LLM-base-SFT, LLM-CP-SFT (continual pre-trained), LLM-base-SFT(RAG), and GAP." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "Baselines include ChatGPT-3.5 (2022), GAP (Colas 2022), RAG (Lewis 2020 but still standard), and contemporaneous base models (ChatGLM2-6B, Llama-2-chat-7B). For a 2024 paper, these are reasonably current." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "Table 2 presents ablation experiments removing K-LoRA, AKGF, KG retrieval, and both K-LoRA & AKGF simultaneously. Figure 2 shows corresponding human evaluation ablations." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "The paper reports ROUGE-1, ROUGE-2, ROUGE-L, and BLEU (n=4) for automated evaluation, plus five-dimensional human evaluation (fluency, relevance, viewpoint, diversity, hallucination)." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": true, 97 "justification": "Section 4.2 describes manual evaluation on 200 sampled entries, ranked across five dimensions: fluency, relevance to question, correctness of core viewpoint, diversity & completeness, and knowledge hallucination. Results shown in Figure 2." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": false, 102 "justification": "The paper creates a 500 train / 1000 test split but mentions no separate validation set. Table 6 shows β parameter comparison on BioASQ which may have been done on the test set. It is unclear whether test data was used for any hyperparameter selection decisions." 
103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Results are broken down across two datasets (CMedQA and BioASQ), multiple metrics, and five human evaluation dimensions. Table 3 shows performance across different KG completeness levels." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "Section 4.5 discusses where ELPF underperforms RAG on BLEU for BioASQ due to KG information loss. Section 5.3 shows sparse KGs (20%) lead to worse performance than no KG. The case study in Figure 6 compares successful and less successful outputs." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "Table 3 shows that reducing KG to 20% yields worse performance than 0% KG, demonstrating that noise in sparse KGs hurts performance. Section 4.5 notes ELPF achieves lower BLEU than RAG on BioASQ." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "The abstract claims 'our approach outperforms existing baselines' on 'two biomedical question-answering datasets' with 'a limited-sample setting.' Table 1 confirms ELPF achieves highest scores across most metrics on both datasets." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "Causal claims like 'Removing K-LoRA leads to the most significant performance drop' (Section 5.1) are supported by controlled ablation experiments in Table 2, where individual components are removed while holding others constant." 
130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper frames ELPF as a general 'modular knowledge infusion framework' and the title 'Efficient Knowledge Infusion via KG-LLM Alignment' implies domain-general applicability, but experiments are limited to two biomedical QA datasets with two specific LLMs. While the Limitations section acknowledges 'we only conducted experiments on medical domain texts,' the framing throughout exceeds the tested scope." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper does not substantively consider alternative explanations for ELPF's improvements. For example, it does not discuss whether the gains come simply from having more training stages (three-stage pipeline vs. single-stage baselines), or from additional data exposure during pre-learning, rather than the specific KG alignment mechanism." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": false, 144 "justification": "The paper claims to produce 'comprehensive, logical, and low-hallucination responses' (Section 3) and improve 'knowledge correctness,' but the primary metrics are ROUGE and BLEU which measure surface text overlap, not knowledge correctness or logical coherence. The human evaluation partially addresses this gap but the disconnect between automated metrics and claimed outcomes is not discussed." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": false, 151 "justification": "ChatGLM2-6B and Llama-2-7b-chat-hf are specified with exact HuggingFace URLs (Section 4.3), but ChatGPT-3.5 is referenced without a specific API version or snapshot date (e.g., gpt-3.5-turbo-0613). Model behavior varies across API versions." 
152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": false, 156 "justification": "Section 3.3 provides the KG-augmented input template format ('[KG]: {gq} [Instruction]: Refer to the KG and answer the following question: {q}'), but the prompts used for KG extraction (Section 3.1), the triples-to-text pre-learning format, and the AKGF generation prompts are not fully provided." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Table 5 (Appendix D) provides detailed hyperparameters for all three stages across both datasets, including batch size, epochs, LoRA rank, LoRA target, learning rate, max input/output length, KL-div β, top-p, and temperature." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "No agentic scaffolding is used. The system is a multi-stage fine-tuning pipeline with KG retrieval, not an agentic system." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 3.1 documents the KG construction pipeline in detail: extraction, four post-processing error removal steps (format errors, hallucinated entities, invalid relations, self-loops), and entity resolution via embedding similarity. Section 4.1 describes dataset preparation (500 train, 1000 test, corpus selection)." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": true, 178 "justification": "A dedicated 'Limitations' section appears after the Conclusions, discussing KG quality dependency, incomplete KG detection issues, conservative AKGF strategy, and single-domain evaluation." 
179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": true, 183 "justification": "The Limitations section discusses threats specific to this study: 'the ELPF method is highly dependent on the quality of the graph construction' with 'inevitably noises,' 'it is challenging to detect knowledge errors unless they conflict with known knowledge,' and the conservative AKGF strategy 'somewhat limits the optimization space.'" 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": true, 188 "justification": "The Limitations section explicitly states: 'we only conducted experiments on medical domain texts. This limitation may pose a risk to the generalized ability of our findings in other scenarios.' This bounds the scope to the medical domain." 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": false, 195 "justification": "The constructed domain KGs, specific train/test splits, extraction training data, and DPO preference pairs are not released. Only the base public datasets (CMedQA, BioASQ) are available." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section 4.1 describes dataset sourcing. Appendix A details the annotation process: reference schemas (CMeIE v2, BioRED), manual annotation of 100 samples from corpora, two annotators for blind labeling plus one QC inspector, inter-annotator agreement of 0.9, acceptance accuracy of 0.97." 201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants in the study. Data sources are standard public benchmarks (CMedQA, BioASQ). Annotators for KG construction are employed staff, not study participants." 
206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "The full pipeline is documented: corpus selection → extraction by fine-tuned LLM → four post-processing steps → entity resolution → KG construction (Section 3.1). Table 4 provides statistics (subjects, triples, precision). Dataset splitting is described in Section 4.1." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding source, grant numbers, or acknowledgments section is present in the paper. All authors are affiliated with Ant Group, a major fintech company." 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "All eight authors are listed with Ant Group affiliation and institutional email addresses on the first page." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": false, 227 "justification": "All authors are employees of Ant Group. As a technology company that could deploy such methods in production, Ant Group has a commercial interest in demonstrating the effectiveness of their knowledge infusion approach. No funding independence statement is provided." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests or financial interests declaration is present in the paper." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": true, 238 "answer": false, 239 "justification": "No training data cutoff dates are stated for ChatGLM2-6B, Llama-2-chat-7B, or ChatGPT-3.5. CMedQA and BioASQ are public datasets that may have been in the pre-training corpora." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": true, 243 "answer": false, 244 "justification": "No discussion of whether CMedQA or BioASQ data appeared in the pre-training data of ChatGLM2-6B, Llama-2, or GPT-3.5. 
The zero-shot baselines are particularly vulnerable to this confound." 245 }, 246 "benchmark_contamination_addressed": { 247 "applies": true, 248 "answer": false, 249 "justification": "CMedQA and BioASQ were published before the training cutoffs of all models used. No contamination analysis or decontamination steps are discussed." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in the study. The human evaluators rank model outputs but are not study subjects." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants. The study evaluates LLM fine-tuning methods on public datasets." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in the study." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in the study." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in the study." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in the study." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in the study." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "No inference cost, latency, or per-example cost is reported despite the method involving multi-stage processing (KG retrieval + LLM generation)." 
294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "Appendix D mentions 'four A100 80GB GPUs and two V100 32GB GPUs' but does not report total GPU hours, training time, or computational cost for any stage." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "No results across multiple random seeds are reported. All results appear to be from single runs despite random train/test splitting." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": false, 310 "justification": "The number of experimental runs is never stated. Results are presented without indicating how many runs produced them." 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "Table 6 compares three β values for DPO, but no overall hyperparameter search budget is reported. The selection process for other parameters (learning rate, LoRA rank, epochs) is not described." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": false, 320 "justification": "Table 6 shows β parameter comparison for BioASQ only, but it is unclear whether this selection was done on test or validation data (no validation set is mentioned). Other hyperparameter selections are not justified." 321 }, 322 "multiple_comparison_correction": { 323 "applies": false, 324 "answer": false, 325 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "The authors compare their ELPF system against baselines without acknowledging the systematic bias of evaluating their own system. No independent evaluation is conducted." 
331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "ELPF requires three stages of training (K-LoRA, SFT, DPO) plus KG construction, substantially more compute than single-stage baselines. This compute disparity is never discussed or controlled for." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": false, 340 "justification": "ROUGE and BLEU are used to evaluate biomedical QA quality without discussing whether these surface overlap metrics validly measure knowledge correctness or response quality in the medical domain." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "No agentic scaffolding is used. The system is a fine-tuning pipeline, not an agentic architecture." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the pre-trained models' training data temporally overlaps with CMedQA or BioASQ benchmark data." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of whether the evaluation setup leaks information. For CMedQA, the non-selected QA pairs are used as KG construction corpus, meaning the KG may encode answer patterns from the same distribution." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "For CMedQA, the corpus for KG construction comes from the same QA dataset (non-selected pairs). The structural similarity between training corpus and test data is not addressed." 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": false, 367 "justification": "No concrete leakage detection or prevention method (canary strings, membership inference, decontamination) is applied." 
368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "ELPF outperforms existing baselines on both CMedQA and BioASQ datasets in limited-sample settings", 374 "evidence": "Table 1 shows ELPF achieves highest ROUGE-1 (19.83), ROUGE-2 (3.86), ROUGE-L (15.44), and BLEU (3.46) on CMedQA, and highest ROUGE-1 (28.55), ROUGE-2 (12.70) on BioASQ. Improvements of 1.03 ROUGE-L on CMedQA and 1.12 ROUGE-L on BioASQ over vanilla SFT (Section 4.5).", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "K-LoRA pre-learning is the most impactful component, enabling the model to better utilize KG information", 379 "evidence": "Table 2 ablation shows removing K-LoRA causes the largest performance drop (e.g., CMedQA ROUGE-L drops from 15.44 to 15.05 without K-LoRA, to 14.02 without both K-LoRA and AKGF). Figure 3 shows lower initial loss and faster convergence with K-LoRA. Figure 4 case study shows K-LoRA model better incorporates KG information.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "AKGF improves knowledge correctness and reduces hallucinations as measured by human evaluation", 384 "evidence": "Figure 2 shows ELPF (with AKGF) achieves the best (lowest) ranking scores on viewpoint correctness, diversity, and hallucination dimensions compared to ablated versions without AKGF.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "Domain KGs can be efficiently constructed with >85% precision using only ~100 annotated samples", 389 "evidence": "Table 4 reports precision of 0.85 on CMedQA (25,963 subjects, 220,111 triples) and 0.89 on BioASQ (20,922 subjects, 53,209 triples). Quality assessed on 200 samples of extracted results. Inter-annotator agreement was 0.9 (Appendix A).", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "ELPF significantly outperforms continual pre-training for domain knowledge infusion", 394 "evidence": "Table 1 shows ELPF vs LLM-CP-SFT: on CMedQA, ROUGE-L 15.44 vs 14.71 (+0.73); on BioASQ, ROUGE-L 24.21 vs 23.55 (+0.66). 
The word 'significant' is used in the paper but no significance tests are performed.", 395 "supported": "weak" 396 } 397 ], 398 "red_flags": [ 399 { 400 "flag": "No statistical significance testing", 401 "detail": "All performance comparisons are based on point estimates with no significance tests, confidence intervals, or error bars. The paper uses language like 'significant improvement' and 'outperforms' without any statistical support. Improvements are often small (e.g., 0.66-1.12 ROUGE-L points) and could be within noise range." 402 }, 403 { 404 "flag": "No variance or multiple-run reporting", 405 "detail": "All results appear to be single-run numbers. Given that the method involves random train/test splitting (500/1000 from larger datasets), LoRA training, and DPO, results could vary substantially across runs." 406 }, 407 { 408 "flag": "Corporate conflict of interest unacknowledged", 409 "detail": "All eight authors are Ant Group employees. No funding disclosure, competing interests statement, or acknowledgment of potential commercial interest in demonstrating effective knowledge infusion methods." 410 }, 411 { 412 "flag": "Potential data leakage in CMedQA setup", 413 "detail": "For CMedQA, the corpus used to construct the KG comes from 'the answer texts from the non-selected QA pairs' — i.e., from the same dataset distribution. This creates a potential leakage path where the KG encodes patterns from the same source as the test data." 414 }, 415 { 416 "flag": "No code or constructed KGs released", 417 "detail": "Despite proposing a full framework (ELPF), no code, constructed knowledge graphs, trained models, or specific data splits are released, making independent verification impossible." 418 }, 419 { 420 "flag": "Compute disparity with baselines not addressed", 421 "detail": "ELPF requires KG construction, K-LoRA pre-learning, supervised fine-tuning, AND DPO alignment — substantially more compute than single-stage baselines. 
This disparity is never controlled for or discussed." 422 } 423 ], 424 "cited_papers": [ 425 { 426 "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks", 427 "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"], 428 "year": 2020, 429 "relevance": "Foundational RAG method used as a baseline; key reference for retrieval-augmented LLM approaches." 430 }, 431 { 432 "title": "LoRA: Low-rank adaptation of large language models", 433 "authors": ["Edward J Hu", "Yelong Shen", "Phillip Wallis"], 434 "year": 2022, 435 "relevance": "Core parameter-efficient fine-tuning method used throughout the ELPF framework for all training stages." 436 }, 437 { 438 "title": "Direct preference optimization: Your language model is secretly a reward model", 439 "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"], 440 "year": 2023, 441 "relevance": "DPO training strategy used in the AKGF alignment stage; key method for preference-based LLM alignment." 442 }, 443 { 444 "title": "Training language models to follow instructions with human feedback", 445 "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"], 446 "year": 2022, 447 "relevance": "RLHF approach that inspired the AKGF feedback mechanism; foundational work on LLM alignment." 448 }, 449 { 450 "title": "Llama 2: Open foundation and fine-tuned chat models", 451 "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"], 452 "year": 2023, 453 "relevance": "Base LLM used in BioASQ experiments (Llama-2-chat-7B); major open-source model family." 454 }, 455 { 456 "title": "Unifying large language models and knowledge graphs: A roadmap", 457 "authors": ["Shirui Pan", "Linhao Luo", "Yufei Wang"], 458 "year": 2024, 459 "relevance": "Survey on LLM-KG integration; provides context for the KG-retrieval-augmented approach used in ELPF." 
460 }, 461 { 462 "title": "LLMs for knowledge graph construction and reasoning: Recent capabilities and future opportunities", 463 "authors": ["Yuqi Zhu", "Xiaohan Wang", "Jing Chen"], 464 "year": 2023, 465 "relevance": "Directly relevant to the domain KG construction component of ELPF using LLM-based extraction." 466 }, 467 { 468 "title": "COMET: commonsense transformers for automatic knowledge graph construction", 469 "authors": ["Antoine Bosselut", "Hannah Rashkin", "Maarten Sap"], 470 "year": 2019, 471 "relevance": "Early work on extracting knowledge from language models to construct KGs; precursor approach to LLM-based KG construction." 472 }, 473 { 474 "title": "SKILL: Structured knowledge infusion for large language models", 475 "authors": ["Fedor Moiseev", "Zhe Dong", "Enrique Alfonseca"], 476 "year": 2022, 477 "relevance": "Prior work on structured knowledge infusion into LLMs; directly relevant baseline approach." 478 }, 479 { 480 "title": "KnowledGPT: Enhancing large language models with retrieval and storage access on knowledge bases", 481 "authors": ["Xintao Wang", "Qianwen Yang", "Yongting Qiu"], 482 "year": 2023, 483 "relevance": "Related approach for augmenting LLMs with knowledge base access; relevant to KG-retrieval methods." 484 } 485 ], 486 "engagement_factors": { 487 "practical_relevance": { 488 "score": 2, 489 "justification": "The KG-LLM alignment framework is applicable to domain-specific LLM deployment, but requires substantial setup (KG construction, multi-stage training) with no released code." 490 }, 491 "surprise_contrarian": { 492 "score": 0, 493 "justification": "Confirms expected intuitions that structured knowledge helps LLMs and that alignment improves knowledge correctness." 494 }, 495 "fear_safety": { 496 "score": 0, 497 "justification": "No safety or security concerns raised; focuses on improving domain QA quality." 
498 }, 499 "drama_conflict": { 500 "score": 0, 501 "justification": "No controversy or provocative claims; standard incremental improvement paper." 502 }, 503 "demo_ability": { 504 "score": 0, 505 "justification": "No code, demo, or models released." 506 }, 507 "brand_recognition": { 508 "score": 1, 509 "justification": "Ant Group is a well-known fintech company but not a prominent AI research lab." 510 } 511 } 512 }