scan.json (23227B)
1 { 2 "paper": { 3 "title": "Beyond Textual Context: Structural Graph Encoding with Adaptive Space Alignment to alleviate the hallucination of LLMs", 4 "authors": ["Yifang Zhang", "Pengfei Duan", "Yiwen Yang", "Shengwu Xiong"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2509.22251" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "The abstract states 'Our code are available at https://github.com/yfangZhang/SSKG-LLM', providing a GitHub repository URL." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses publicly available benchmark datasets: CommonsenseQA, SIQA, and TruthfulQA. These are standard public benchmarks the authors did not modify." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper mentions using 'a NVIDIA GeForce RTX 5880 GPU (with 48GB of VRAM)' and an AdamW optimizer with learning rate 1e-4, but provides no requirements.txt, Dockerfile, conda environment, or detailed library version list sufficient to recreate the environment." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided in the paper. A GitHub link is given but the paper itself does not include a 'Reproducing Results' section or commands to run." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "All results in Tables 1 and 2 and Figures 3-5 are reported as point estimates without confidence intervals, error bars, or ± notation." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims SSKG-LLM 'significantly outperforms' baselines (Section 4.3) but provides no statistical significance tests (no p-values, t-tests, or any other test). 
Differences are assessed purely by comparing raw numbers." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper mentions 'improvements across different datasets are approximately 8%' (Section 4.3) but does not provide systematic effect sizes with baseline context for each comparison. No Cohen's d, odds ratios, or other formal effect size measures are reported." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification is given for the choice of datasets or their sizes. No power analysis is discussed." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. All results appear to be single-run numbers. The robustness analysis in the appendix varies LoRA parameters but does not report variance across repeated runs." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Table 1 includes extensive baselines across four categories: base LLMs (no retrieval), RAG methods, LLMs-based KBQA (GPT-3.5+KSL, ChatGPT+ToG-R, GPT-4+ToG-R, KG-CoT), and hybrid encoding methods (KnowLA, KAPING, KG-Adapter)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include contemporary methods from 2023-2024: KG-CoT (2024), KnowLA (2024), KG-Adapter (2024), ToG-R (2024), and others. These represent the state of the art in KG-LLM integration." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Table 2 presents an ablation study systematically removing each module (KGR, KGE, KGA) to assess individual contributions. Additional ablations are provided for graph traversal strategies (Section 4.4.3) and different KG encoders (Section 4.4.3)." 
74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper uses accuracy (ACC) for CommonsenseQA and SIQA, and ROUGE-1, ROUGE-2, and BLEU-score for TruthfulQA, providing multiple metrics across tasks." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation is included. All evaluations are automated (accuracy matching and similarity metrics). For a paper claiming to reduce hallucinations and improve truthfulness, human evaluation of output quality would be relevant." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": false, 88 "justification": "The paper does not explicitly describe the separation of dev and test splits. It mentions using standard datasets but does not clarify whether results are on held-out test sets or validation sets used for any tuning decisions." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down per dataset (CommonsenseQA, SIQA, TruthfulQA) in Table 1 and per module configuration in Table 2. Additional per-encoder and per-traversal breakdowns are in Figures 3-4." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": false, 98 "justification": "No failure cases or qualitative error analysis is provided. The paper does not show examples where SSKG-LLM produces incorrect answers or discuss specific failure modes." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The ablation study reveals a negative finding: removing the KGA module while keeping KGE actually hurts performance compared to removing both (Section 4.4.2), showing that the KG encoder without adaptation exacerbates the embedding gap. This is an informative negative result." 
104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims SSKG-LLM integrates structural and semantic information of KGs and enhances factual reasoning. Table 1 shows improvements over baselines on all three datasets, and the ablation study supports the contribution of each module. The claims are supported by the results." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper makes causal claims through ablation studies ('removing component X reduces performance by Y%' in Table 2 and Sections 4.4.1-4.4.3). The ablation design uses controlled single-variable manipulation, which is adequate for the causal claims made." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title and abstract make broad claims about alleviating 'the hallucination of LLMs' but testing is limited to three English QA datasets with two specific LLMs (Qwen1.5-14B and Llama2-13B). The Limitation section acknowledges the English-only constraint but the title/abstract do not bound generalization to the tested setting." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper does not discuss alternative explanations for its results. For instance, improvements could be partially due to additional parameters from the KGA module or the fine-tuning process rather than structural KG information specifically. No threats-to-validity or confound discussion is included." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper specifies 'Qwen1.5-14B-Chat' and 'Llama2-13B-Chat' as the foundation models (Section 4.2), along with the GraphLM encoder. These are specific enough to identify the exact model variants." 
133 }, 134 "prompts_provided": { 135 "applies": false, 136 "answer": false, 137 "justification": "The paper does not use prompting in the traditional sense. The input to the LLM is constructed via an embedding pipeline (KGR -> KGE -> KGA -> concatenation with query embedding), not through text prompts. The system is fine-tuned end-to-end with LoRA." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Section 4.2 reports: learning rate 1e-4, linear warm-up for 100 steps, batch size 4, AdamW optimizer. The appendix explores LoRA rank values R=8, R=16, R=32. GPU type (RTX 5880, 48GB VRAM) is also stated." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "The paper does not use agentic scaffolding. SSKG-LLM is a model architecture with modular components (KGR, KGE, KGA) that processes input in a single forward pass, not an agent with tools, retry logic, or feedback mechanisms." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 3.1 describes the KGR preprocessing pipeline: tokenization via SpaCy, stopword filtering via NLTK, keyword mapping to ConceptNet via exact string matching, TF-IDF scoring for topic identification, subgraph retrieval via Wikidata/ConceptNet APIs, and DFS serialization." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "A dedicated 'Limitation' section is present after the Conclusion (Section 5), discussing computational resource demands and the English-only constraint." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "The limitations section mentions two generic points: 'more computational resources' and 'limited to English datasets.' 
These are surface-level observations without specific quantification or analysis of how they affect the reported results." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper does not explicitly state what the results do NOT show. The Limitation section mentions English-only testing and computational cost but does not bound the claims to specific KG types, domains, question formats, or model scales." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "While the benchmark datasets are publicly available, the paper does not release the actual experimental outputs, model predictions, or training data constructed for fine-tuning." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 4.1 describes the datasets used (CommonsenseQA, SIQA, TruthfulQA) with citations. Section 3.1 describes how knowledge graphs are retrieved from ConceptNet and Wikidata. The data collection procedure for the benchmark evaluation is adequately documented." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants. The paper uses standard benchmarks (CommonsenseQA, SIQA, TruthfulQA) which are publicly available." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The full pipeline from query to answer is documented in Section 3: tokenization -> topic extraction -> subgraph retrieval -> DFS serialization -> Levi graph conversion -> GraphLM encoding -> KGA adaptation -> concatenation -> LoRA fine-tuned LLM inference." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding source or acknowledgments section is present in the paper. 
There is no mention of grants, sponsors, or funding agencies." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are listed: all four authors are from Wuhan University of Technology. Correspondence emails are provided." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding statement does not establish that the work is unfunded." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper fine-tunes Qwen1.5-14B-Chat and Llama2-13B-Chat on QA benchmarks but does not state the training data cutoff dates of these base models. CommonsenseQA (2019) and SIQA (2019) could be in the pre-training data." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of potential train/test overlap between the pre-training data of Qwen1.5/Llama2 and the benchmark test sets. CommonsenseQA and SIQA are well-known benchmarks that may appear in pre-training corpora." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "CommonsenseQA (2019), SIQA (2019), and TruthfulQA (2022) were all published before the training cutoffs of Qwen1.5 and Llama2, creating contamination risk. This is not addressed in the paper." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 
238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No inference cost, latency, or tokens-per-example is reported. The Limitation section acknowledges the method 'demands more computational resources' but provides no quantification." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "While the GPU type (RTX 5880, 48GB VRAM) is mentioned, the total training time, GPU hours, or total computational budget is not quantified." 
280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "SSKG-LLM consistently outperforms all baseline methods across both multiple-choice QA datasets and the TruthfulQA dataset, with improvements of approximately 8%.", 286 "evidence": "Table 1 shows SSKG-LLM (Llama2-13B) achieves 86.54% on CommonsenseQA vs 85.65% best baseline (SFT no-retrieval), 77.92% on SIQA vs 77.8% (ChatGPT+ToG-R), and 0.3462 BLEU on TruthfulQA vs 0.3391 best baseline.", 287 "supported": "weak" 288 }, 289 { 290 "claim": "The KGA module bridges the gap between KG encoder embeddings and LLM embedding spaces, and removing it leads to performance decline.", 291 "evidence": "Table 2 ablation shows removing KGA (row 1: KGR+KGE without KGA) reduces SIQA accuracy from 77.95% to 70.77%, a 7.18-percentage-point drop. Section 4.4.2 discusses the embedding gap phenomenon.", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "DFS traversal strategy is superior to BFS and Random Walk for KG serialization.", 296 "evidence": "Figure 3 shows DFS outperforms BFS and Random Walk across datasets, attributed to generating longer chains similar to Chain-of-Thought reasoning (Section 4.4.3).", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "GraphLM consistently delivers superior performance compared to GNN, GraphTransformer, and no-encoder baselines.", 301 "evidence": "Figure 4 shows GraphLM outperforms alternatives across datasets (Section 4.4.3).", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "SSKG-LLM is the first approach to simultaneously integrate both semantic and structural information of KGs during joint reasoning with LLMs.", 306 "evidence": "Stated in contributions (Section 1). 
The related work (Section 2) categorizes prior methods into Hybrid Encoding, GraphRAG, and LLMs-based KBQA, arguing each has limitations in this regard.", 307 "supported": "weak" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "SSKG-LLM integrates structural and semantic information from knowledge graphs into LLM reasoning through three modules: KGR (retrieval), KGE (encoding with GraphLM), and KGA (cross-attention adaptation). Evaluated on CommonsenseQA, SIQA, and TruthfulQA, SSKG-LLM shows modest improvements over baselines (approximately 0.1-0.9 percentage points over the best baselines on multiple-choice tasks). Ablation studies reveal that removing the KGA module causes significant performance drops, particularly on SIQA (7.18 percentage points), and that directly injecting KG embeddings without adaptation can hurt performance compared to plain-text input. The DFS traversal strategy and GraphLM encoder are shown to be the best-performing of the options compared within the framework.", 312 "red_flags": [ 313 { 314 "flag": "Overstated improvement claims", 315 "detail": "The paper claims 'approximately 8%' improvement but actual margins over the best baselines are much smaller: ~0.9 percentage points on CommonsenseQA (86.54 vs 85.65), ~0.1 percentage points on SIQA (77.92 vs 77.8), and ~0.007 BLEU on TruthfulQA (0.3462 vs 0.3391). The 8% figure seems to compare against non-SFT or weaker baselines selectively." 316 }, 317 { 318 "flag": "No statistical significance testing", 319 "detail": "Given the small margins over the best baselines (often <1%), the absence of significance tests, error bars, or multi-run variance reporting makes it impossible to determine whether the claimed improvements are meaningful or within noise." 320 }, 321 { 322 "flag": "Benchmark contamination risk unaddressed", 323 "detail": "CommonsenseQA (2019) and SIQA (2019) are old enough to be in the pre-training data of Qwen1.5 and Llama2. 
The fine-tuning process could be overfitting to memorized patterns rather than demonstrating genuine structural reasoning improvement." 324 }, 325 { 326 "flag": "Incomplete baseline comparisons", 327 "detail": "Many baseline entries in Table 1 have missing values (dashes) across datasets. GPT-3.5+KSL has results only for CommonsenseQA, KG-CoT only for SIQA, KG-Adapter only for CommonsenseQA. This makes cross-method comparison difficult and potentially misleading." 328 }, 329 { 330 "flag": "No error bars on single-run results", 331 "detail": "Results appear to be from single runs with no repeated experiments. For the small margins reported, variance across random seeds or data ordering could easily explain the differences." 332 } 333 ], 334 "cited_papers": [ 335 { 336 "title": "GPT-4 Technical Report", 337 "authors": ["Josh Achiam"], 338 "year": 2023, 339 "arxiv_id": "2303.08774", 340 "relevance": "Foundational LLM evaluated in the baseline comparisons, central to the LLM capability assessment literature." 341 }, 342 { 343 "title": "From Local to Global: A Graph RAG Approach to Query-Focused Summarization", 344 "authors": ["Darren Edge", "Ha Trinh", "Newman Cheng"], 345 "year": 2024, 346 "arxiv_id": "2404.16130", 347 "relevance": "Key GraphRAG method for integrating knowledge graphs with LLMs, directly relevant to retrieval-augmented generation approaches." 348 }, 349 { 350 "title": "LoRA: Low-Rank Adaptation of Large Language Models", 351 "authors": ["Edward J Hu"], 352 "year": 2021, 353 "arxiv_id": "2106.09685", 354 "relevance": "Parameter-efficient fine-tuning method used as the core training approach in SSKG-LLM and widely used across LLM research." 355 }, 356 { 357 "title": "Think-on-Graph: Deep and Responsible Reasoning of Large Language Model on Knowledge Graph", 358 "authors": ["Jiashuo Sun", "Chengjin Xu"], 359 "year": 2024, 360 "relevance": "LLM-based KBQA method used as a baseline, demonstrates agentic LLM reasoning over knowledge graphs." 
361 }, 362 { 363 "title": "KG-CoT: Chain-of-Thought Prompting of Large Language Models over Knowledge Graphs for Knowledge-Aware Question Answering", 364 "authors": ["Ruilin Zhao", "Feng Zhao"], 365 "year": 2024, 366 "relevance": "Baseline method combining chain-of-thought reasoning with knowledge graphs, relevant to LLM reasoning enhancement." 367 }, 368 { 369 "title": "KnowLA: Enhancing Parameter-Efficient Finetuning with Knowledgeable Adaptation", 370 "authors": ["Xindi Luo", "Zequn Sun"], 371 "year": 2024, 372 "relevance": "Hybrid encoding baseline that integrates KG entity embeddings into LLMs via adaptation layers." 373 }, 374 { 375 "title": "KG-Adapter: Enabling Knowledge Graph Integration in Large Language Models through Parameter-Efficient Fine-Tuning", 376 "authors": ["Shiyu Tian", "Yangyang Luo"], 377 "year": 2024, 378 "relevance": "Parameter-level KG integration method used as a baseline, directly comparable approach to SSKG-LLM." 379 }, 380 { 381 "title": "TruthfulQA: Measuring How Models Mimic Human Falsehoods", 382 "authors": ["Stephanie Lin", "Jacob Hilton", "Owain Evans"], 383 "year": 2022, 384 "relevance": "Key benchmark for evaluating LLM truthfulness and hallucination, used as evaluation dataset in this paper." 385 }, 386 { 387 "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models", 388 "authors": ["Hugo Touvron", "Louis Martin"], 389 "year": 2023, 390 "arxiv_id": "2307.09288", 391 "relevance": "One of two foundation LLMs used in experiments, central to open-source LLM evaluation." 392 }, 393 { 394 "title": "Unifying Large Language Models and Knowledge Graphs: A Roadmap", 395 "authors": ["Shirui Pan", "Linhao Luo"], 396 "year": 2024, 397 "relevance": "Survey paper on LLM-KG integration approaches, provides context for the research area." 398 } 399 ] 400 }