scan.json (30392B)
1 { 2 "paper": { 3 "title": "HyperGraphRAG: Retrieval-Augmented Generation via Hypergraph-Structured Knowledge Representation", 4 "authors": [ 5 "Haoran Luo", 6 "Haihong E", 7 "Guanting Chen", 8 "Yandan Zheng", 9 "Xiaobao Wu", 10 "Yikai Guo", 11 "Qika Lin", 12 "Yu Feng", 13 "Zemin Kuang", 14 "Meina Song", 15 "Yifan Zhu", 16 "Luu Anh Tuan" 17 ], 18 "year": 2025, 19 "venue": "NeurIPS 2025", 20 "arxiv_id": "2503.21322" 21 }, 22 "scan_version": 3, 23 "active_modules": ["experimental_rigor", "data_leakage"], 24 "methodology_tags": ["benchmark-eval", "theoretical"], 25 "key_findings": "HyperGraphRAG, a hypergraph-based RAG framework modeling n-ary relational facts via hyperedges, consistently outperforms standard RAG and binary graph-based RAG methods (GraphRAG, LightRAG, PathRAG, HippoRAG2) across five domains (medicine, agriculture, CS, legal, mix) on F1, retrieval similarity, and generation evaluation metrics. Ablation studies confirm that entity retrieval, hyperedge retrieval, and chunk retrieval fusion each contribute to performance. The paper provides information-theoretic proofs that hypergraph representation preserves more information than binary graph representation for n-ary facts. Cost analysis shows HyperGraphRAG achieves a favorable trade-off between construction/generation cost and output quality.", 26 "checklist": { 27 "artifacts": { 28 "code_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "The paper states 'Our data and code are publicly available' with a footnote linking to https://github.com/LHRLAB/HyperGraphRAG." 32 }, 33 "data_released": { 34 "applies": true, 35 "answer": true, 36 "justification": "The paper states 'Our data and code are publicly available.' The evaluation uses four domains from the publicly available UltraDomain dataset and publicly available hypertension guidelines. The constructed QA benchmark is claimed to be released." 
37 }, 38 "environment_specified": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper mentions '80-core CPU and 512GB RAM' and the models used (GPT-4o-mini, text-embedding-3-small) but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions." 42 }, 43 "reproduction_instructions": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper provides a GitHub link but includes no step-by-step reproduction instructions, no README-level commands, and no 'Reproducing Results' section." 47 } 48 }, 49 "statistical_methodology": { 50 "confidence_intervals_or_error_bars": { 51 "applies": true, 52 "answer": false, 53 "justification": "All results in Tables 2 and 3 and Figures 4-7 are reported as point estimates with no confidence intervals, error bars, or ± notation." 54 }, 55 "significance_tests": { 56 "applies": true, 57 "answer": false, 58 "justification": "The paper claims HyperGraphRAG 'outperforms' all baselines but no statistical significance tests (p-values, t-tests, etc.) are reported. All comparisons are based solely on comparing point estimates." 59 }, 60 "effect_sizes_reported": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper reports improvements with baseline context, e.g., 'gains of +7.45 (F1), +7.62 (R-S), and +3.69 (G-E)' compared to StandardRAG, and per-source-type improvements ('+8.6, +8.8, and +4.4' for Binary Source)." 64 }, 65 "sample_size_justified": { 66 "applies": true, 67 "answer": false, 68 "justification": "The evaluation uses 512 questions per domain (256 binary + 256 n-ary) but no justification is given for why this sample size was chosen and no power analysis is discussed." 69 }, 70 "variance_reported": { 71 "applies": true, 72 "answer": false, 73 "justification": "No standard deviations, variance across runs, or interquartile ranges are reported. Results appear to be from single runs with no spread measures." 
74 } 75 }, 76 "evaluation_design": { 77 "baselines_included": { 78 "applies": true, 79 "answer": true, 80 "justification": "Six baselines are compared: NaiveGeneration, StandardRAG, GraphRAG, LightRAG, PathRAG, and HippoRAG2 (Table 2)." 81 }, 82 "baselines_contemporary": { 83 "applies": true, 84 "answer": true, 85 "justification": "Baselines include recent methods: LightRAG (2024), PathRAG (2025), HippoRAG2 (2025), representing the current state of the art in graph-based RAG." 86 }, 87 "ablation_study": { 88 "applies": true, 89 "answer": true, 90 "justification": "Figure 4 shows an ablation study removing entity retrieval (w/o ER), hyperedge retrieval (w/o HR), chunk retrieval (w/o CR), and combinations thereof, measuring impact on F1, R-S, and G-E." 91 }, 92 "multiple_metrics": { 93 "applies": true, 94 "answer": true, 95 "justification": "Three evaluation metrics are used: F1 (word-level accuracy), R-S (retrieval similarity), and G-E (generation evaluation across 7 dimensions)." 96 }, 97 "human_evaluation": { 98 "applies": true, 99 "answer": false, 100 "justification": "No human evaluation of system outputs is performed. The G-E metric uses GPT-4o-mini as an LLM judge. Human annotators only verified the constructed QA benchmark, not the system's generated answers." 101 }, 102 "held_out_test_set": { 103 "applies": true, 104 "answer": false, 105 "justification": "No explicit separation of dev and test splits is described. Hyperparameters (k=60, thresholds) appear to have been tuned without a separate validation set. Figure 6(a) shows top-k analysis that may have been conducted on the test data." 106 }, 107 "per_category_breakdown": { 108 "applies": true, 109 "answer": true, 110 "justification": "Table 2 provides per-domain breakdowns (Medicine, Agriculture, CS, Legal, Mix) and per-source-type breakdowns (Binary Source, N-ary Source, Overall)." 
111 }, 112 "failure_cases_discussed": { 113 "applies": true, 114 "answer": false, 115 "justification": "No failure cases are discussed. The case study in Appendix H shows only a success case where HyperGraphRAG gives the correct answer while baselines fail. No error analysis or discussion of where the approach breaks down." 116 }, 117 "negative_results_reported": { 118 "applies": true, 119 "answer": false, 120 "justification": "Every experiment shows HyperGraphRAG outperforming all baselines. No configurations that failed, approaches tried and abandoned, or ablations that unexpectedly hurt are reported." 121 } 122 }, 123 "claims_and_evidence": { 124 "abstract_claims_supported": { 125 "applies": true, 126 "answer": true, 127 "justification": "The abstract claims HyperGraphRAG 'outperforms both standard RAG and previous graph-based RAG methods in answer accuracy, retrieval efficiency, and generation quality.' Table 2 shows it achieves the best scores on F1, R-S, and G-E across all domains." 128 }, 129 "causal_claims_justified": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper's causal claims are primarily supported through ablation studies (Figure 4) that use controlled single-variable manipulation — removing one component at a time while holding others constant. This adequately supports claims like 'entity retrieval is critical for precise retrieval.'" 133 }, 134 "generalization_bounded": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper tests only with GPT-4o-mini for extraction and generation, and text-embedding-3-small for embeddings, but the abstract and conclusions make broad claims about HyperGraphRAG's effectiveness without bounding them to this specific LLM configuration. The title 'Retrieval-Augmented Generation via Hypergraph-Structured Knowledge Representation' is broader than what was tested." 
138 }, 139 "alternative_explanations_discussed": { 140 "applies": true, 141 "answer": false, 142 "justification": "No alternative explanations are discussed. The paper does not consider confounds such as whether the improvements come from the richer knowledge extraction (more entities/hyperedges) rather than the hypergraph structure itself, or whether GPT-4o-mini's behavior might differ from other LLMs." 143 }, 144 "proxy_outcome_distinction": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper uses 'retrieval efficiency' to describe the R-S metric, but R-S measures semantic similarity between retrieved and ground-truth knowledge — not efficiency (speed/resource usage). The G-E metric combines LLM-judged quality with F1 (Equation 45), conflating automated and subjective measures. These proxy-outcome gaps are not acknowledged." 148 } 149 }, 150 "setup_transparency": { 151 "model_versions_specified": { 152 "applies": true, 153 "answer": false, 154 "justification": "The paper specifies 'GPT-4o-mini' and 'text-embedding-3-small' but provides no snapshot dates or API version identifiers. Model behavior can change across API versions." 155 }, 156 "prompts_provided": { 157 "applies": true, 158 "answer": true, 159 "justification": "Full prompt text is provided in the appendix: Figure 8 (n-ary relation extraction prompt), Figure 9 (entity extraction prompt), Figure 10 (generation prompt), and Figures 11-12 (evaluation prompts)." 160 }, 161 "hyperparameters_reported": { 162 "applies": true, 163 "answer": true, 164 "justification": "Section 5.1 and Table 4 report: entity retrieval kV=60, τV=50; hyperedge retrieval kH=60, τH=5; chunk retrieval kC=5, τC=0.5; temperature=1.0; max generation length=32k tokens; 16 parallel cores." 
165 }, 166 "scaffolding_described": { 167 "applies": false, 168 "answer": false, 169 "justification": "HyperGraphRAG is a pipeline (extraction → storage → retrieval → generation) without agentic scaffolding — no tool use, retry logic, feedback loops, or memory management." 170 }, 171 "data_preprocessing_documented": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 4.1 describes the knowledge extraction pipeline (n-ary relation extraction, bipartite storage, vector representation). Appendix D details question construction (512 per domain, binary/n-ary split, 1/2/3-hop sampling with specific counts: 128/64/64)." 175 } 176 }, 177 "limitations_and_scope": { 178 "limitations_section_present": { 179 "applies": true, 180 "answer": true, 181 "justification": "Appendix I is titled 'Limitations and Future Work' with five subsections (I.1-I.5) covering multimodal extension, RL integration, federated learning, foundation models, and scaling to harder tasks." 182 }, 183 "threats_to_validity_specific": { 184 "applies": true, 185 "answer": false, 186 "justification": "The limitations section discusses only future research directions (multimodal, RL, federated, foundation models, harder tasks). No specific threats to the current study's validity are discussed — e.g., single LLM dependency, LLM-as-judge bias, benchmark construction bias, lack of statistical testing." 187 }, 188 "scope_boundaries_stated": { 189 "applies": true, 190 "answer": false, 191 "justification": "The limitations section does not state what the current results do NOT show. There is no explicit bounding of claims to tested models, domains, or evaluation conditions." 192 } 193 }, 194 "data_integrity": { 195 "raw_data_available": { 196 "applies": true, 197 "answer": true, 198 "justification": "The paper states 'Our data and code are publicly available' with a GitHub link. 
The knowledge sources (UltraDomain, hypertension guidelines) are public, and the constructed QA benchmarks are claimed to be released." 199 }, 200 "data_collection_described": { 201 "applies": true, 202 "answer": true, 203 "justification": "Appendix D describes data collection: knowledge from UltraDomain (4 domains) and hypertension guidelines (medicine), question sampling via 1/2/3-hop traversal with specific counts, GPT-generated questions with human verification." 204 }, 205 "recruitment_methods_described": { 206 "applies": false, 207 "answer": false, 208 "justification": "No human participants were recruited. Data sources are standard datasets (UltraDomain) and public medical guidelines. Human annotators verified QA pairs but are not study participants." 209 }, 210 "data_pipeline_documented": { 211 "applies": true, 212 "answer": true, 213 "justification": "The pipeline is documented: raw documents → n-ary relation extraction (Algorithm 1) → bipartite graph storage → vector embedding → retrieval and generation (Algorithm 2). Question construction pipeline is described in Appendix D with counts per hop level." 214 } 215 }, 216 "conflicts_of_interest": { 217 "funding_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "The Acknowledgments section lists NSFC grants (No. 62473271, 62176026, 62406036) and the Engineering Research Center of Information Networks, Ministry of Education, China." 221 }, 222 "affiliations_disclosed": { 223 "applies": true, 224 "answer": true, 225 "justification": "All author affiliations are listed: Beijing University of Posts and Telecommunications, Nanyang Technological University, Beijing Institute of Computer Technology, National University of Singapore, China Mobile Research Institute, Beijing Anzhen Hospital." 
226 }, 227 "funder_independent_of_outcome": { 228 "applies": true, 229 "answer": true, 230 "justification": "Funding is from NSFC (National Natural Science Foundation of China) and the Ministry of Education — government research bodies with no commercial stake in the results." 231 }, 232 "financial_interests_declared": { 233 "applies": true, 234 "answer": false, 235 "justification": "No competing interests or financial interests statement is included in the paper." 236 } 237 }, 238 "contamination": { 239 "training_cutoff_stated": { 240 "applies": true, 241 "answer": false, 242 "justification": "The paper uses GPT-4o-mini for extraction, generation, and evaluation but does not state the model's training data cutoff date. This is relevant because the knowledge sources (UltraDomain, medical guidelines) could be in the training data." 243 }, 244 "train_test_overlap_discussed": { 245 "applies": true, 246 "answer": false, 247 "justification": "No discussion of whether GPT-4o-mini may have seen the UltraDomain data, hypertension guidelines, or similar QA pairs during training. The NaiveGeneration baseline shows non-zero performance (12-23% F1), suggesting some knowledge overlap." 248 }, 249 "benchmark_contamination_addressed": { 250 "applies": true, 251 "answer": false, 252 "justification": "The UltraDomain dataset and international hypertension guidelines are publicly available documents that were likely in GPT-4o-mini's training data. No contamination analysis or discussion is provided." 253 } 254 }, 255 "human_studies": { 256 "pre_registered": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants in the study. Human annotators verified QA pairs but were not study subjects." 260 }, 261 "irb_or_ethics_approval": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in the study." 
265 }, 266 "demographics_reported": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in the study." 270 }, 271 "inclusion_exclusion_criteria": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in the study." 275 }, 276 "randomization_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants in the study." 280 }, 281 "blinding_described": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants in the study." 285 }, 286 "attrition_reported": { 287 "applies": false, 288 "answer": false, 289 "justification": "No human participants in the study." 290 } 291 }, 292 "cost_and_practicality": { 293 "inference_cost_reported": { 294 "applies": true, 295 "answer": true, 296 "justification": "Table 3 reports time per query (TPQ = 0.256s) and cost per 1k queries (CP1kQ = $3.184) for HyperGraphRAG, with comparisons to all baselines." 297 }, 298 "compute_budget_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Table 3 reports construction costs (TP1kT = 3.084s, CP1kT = $0.0063 per 1k tokens). Hardware is specified as '80-core CPU and 512GB RAM.' Token costs per 1k tokens are provided." 302 } 303 }, 304 "experimental_rigor": { 305 "seed_sensitivity_reported": { 306 "applies": true, 307 "answer": false, 308 "justification": "No mention of random seeds, seed sensitivity analysis, or results across multiple seeds. The LLM temperature is set to 1.0 (Table 4), which introduces stochasticity, but no seed analysis is reported." 309 }, 310 "number_of_runs_stated": { 311 "applies": true, 312 "answer": false, 313 "justification": "The paper does not state how many experimental runs produced the reported results. Given temperature=1.0, outputs are stochastic, making this omission significant." 
314 }, 315 "hyperparameter_search_budget": { 316 "applies": true, 317 "answer": false, 318 "justification": "Hyperparameters (kV=60, τV=50, kH=60, τH=5, kC=5, τC=0.5) are stated but no search budget, number of configurations tried, or search method is reported." 319 }, 320 "best_config_selection_justified": { 321 "applies": true, 322 "answer": false, 323 "justification": "Figure 6(a) shows top-k analysis suggesting k=60 as a saturation point, but this appears done on the test set. No validation set is mentioned for configuration selection." 324 }, 325 "multiple_comparison_correction": { 326 "applies": false, 327 "answer": false, 328 "justification": "No statistical tests are performed at all, so there are no multiple comparisons to correct." 329 }, 330 "self_comparison_bias_addressed": { 331 "applies": true, 332 "answer": false, 333 "justification": "The authors implement and evaluate their own system against baselines without acknowledging author-evaluation bias. No independent evaluation or discussion of potential bias in their baseline implementations." 334 }, 335 "compute_budget_vs_performance": { 336 "applies": true, 337 "answer": true, 338 "justification": "Table 3 compares time and cost across all methods for both construction and generation phases. Figure 6(b) compares F1 performance under constrained retrieval token lengths across methods." 339 }, 340 "benchmark_construct_validity": { 341 "applies": true, 342 "answer": false, 343 "justification": "The evaluation benchmark is GPT-generated from knowledge fragments with human verification, but the paper does not discuss whether this benchmark design validly measures real-world QA performance or whether GPT-generated questions introduce systematic biases." 
344 }, 345 "scaffold_confound_addressed": { 346 "applies": true, 347 "answer": true, 348 "justification": "All methods use the same unified generation prompt (Figure 10), same generation model (GPT-4o-mini), same temperature (1.0), and same max token length (32k), as stated in Appendix E and Table 4." 349 } 350 }, 351 "data_leakage": { 352 "temporal_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "No discussion of temporal leakage. The knowledge sources (UltraDomain, 2024 hypertension guidelines) may have been in GPT-4o-mini's training data. No temporal analysis is provided." 356 }, 357 "feature_leakage_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "No discussion of whether the evaluation setup leaks information. GPT-4o-mini generates both the questions and the answers, which could introduce circular information flow." 361 }, 362 "non_independence_addressed": { 363 "applies": true, 364 "answer": false, 365 "justification": "No discussion of independence between training data and evaluation data. Questions are generated from publicly available knowledge that GPT-4o-mini was likely trained on." 366 }, 367 "leakage_detection_method": { 368 "applies": true, 369 "answer": false, 370 "justification": "No leakage detection or prevention methods are used — no canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines." 
371 } 372 } 373 }, 374 "claims": [ 375 { 376 "claim": "HyperGraphRAG consistently outperforms all baselines across F1, R-S, and G-E metrics, with gains of +7.45 (F1), +7.62 (R-S), and +3.69 (G-E) over StandardRAG.", 377 "evidence": "Table 2 shows HyperGraphRAG achieves the highest scores in all 5 domains across all 3 metrics for the Binary Source, N-ary Source, and Overall evaluations (Section 5.2).", 378 "supported": "moderate" 379 }, 380 { 381 "claim": "Hypergraph-structured knowledge representation is more comprehensive than binary graph representation.", 382 "evidence": "Formal information-theoretic proof in Appendix B.1 showing binary representation is lossy for n-ary facts (n≥3), supplemented by Figure 5(f) showing HyperGraphRAG extracts more entities and hyperedges than GraphRAG or LightRAG across all domains.", 383 "supported": "strong" 384 }, 385 { 386 "claim": "Entity retrieval, hyperedge retrieval, and chunk retrieval each independently contribute to HyperGraphRAG's performance.", 387 "evidence": "Ablation study in Figure 4 (Medicine domain only) shows F1 drops from 35.4 to 29.8 (w/o ER), 26.4 (w/o HR), and 29.2 (w/o CR), with combined removal dropping to 12.9.", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "HyperGraphRAG achieves efficient retrieval even under constrained retrieval length budgets, outperforming binary graph methods.", 392 "evidence": "Figure 6(b) shows HyperGraphRAG's F1 exceeds all baselines across retrieval lengths from 32 to 8192 tokens in the Medicine domain.", 393 "supported": "moderate" 394 }, 395 { 396 "claim": "A bipartite graph can losslessly preserve and query a knowledge hypergraph.", 397 "evidence": "Formal proof in Appendix B.2 using incidence matrix representation, showing bijection between hypergraph and bipartite graph with equivalent query semantics.", 398 "supported": "strong" 399 }, 400 { 401 "claim": "HyperGraphRAG achieves a favorable trade-off between construction/generation cost and output quality.", 402 
"evidence": "Table 3 shows construction cost of $0.0063/1k tokens (between HippoRAG2 at $0.0056 and LightRAG at $0.0081) and generation cost of $3.184/1k queries (lower than PathRAG at $3.496 and LightRAG at $3.359).", 403 "supported": "moderate" 404 } 405 ], 406 "red_flags": [ 407 { 408 "flag": "LLM-as-judge circular evaluation", 409 "detail": "The G-E metric uses GPT-4o-mini to judge generation quality of outputs also produced by GPT-4o-mini. This creates a potential self-preference bias where the judge may favor outputs that match its own generation style. No human evaluation of system outputs is performed to validate the LLM-judged scores." 410 }, 411 { 412 "flag": "No statistical testing on claims of superiority", 413 "detail": "All claims that HyperGraphRAG 'outperforms' baselines are based on comparing point estimates with no significance tests, confidence intervals, or error bars. With temperature=1.0 and no reported number of runs, the observed differences could be within random variation." 414 }, 415 { 416 "flag": "Apparent single-run results with stochastic model", 417 "detail": "The paper uses GPT-4o-mini with temperature=1.0 but reports no multiple runs, no standard deviations, and no seed sensitivity. LLM outputs at temperature 1.0 are stochastic, so single-run results may not be stable." 418 }, 419 { 420 "flag": "Ablation study limited to single domain", 421 "detail": "The ablation study (Figure 4) is conducted only in the Medicine domain. Given that performance varies across domains in Table 2, component contributions may differ in other domains." 422 }, 423 { 424 "flag": "Author-constructed benchmark with GPT-generated questions", 425 "detail": "The evaluation benchmark questions are generated by GPT from the same knowledge fragments used to build HyperGraphRAG's hypergraph. This creates a risk that the questions are biased toward the types of knowledge structures HyperGraphRAG captures well. 
All QA pairs are GPT-generated, potentially favoring methods that align with GPT's output distribution." 426 }, 427 { 428 "flag": "No contamination analysis despite using public knowledge sources", 429 "detail": "The knowledge corpora (UltraDomain dataset, international hypertension guidelines) are publicly available and likely in GPT-4o-mini's training data. The NaiveGeneration baseline achieves non-trivial F1 scores (12-23%), suggesting the model has prior knowledge of the test content. No contamination analysis is performed." 430 } 431 ], 432 "cited_papers": [ 433 { 434 "title": "From local to global: A graph rag approach to query-focused summarization", 435 "authors": ["Darren Edge", "Ha Trinh", "Newman Cheng"], 436 "year": 2024, 437 "relevance": "Foundational GraphRAG method that structures knowledge as graphs for RAG; primary baseline in this work." 438 }, 439 { 440 "title": "LightRAG: Simple and fast retrieval-augmented generation", 441 "authors": ["Zirui Guo", "Lianghao Xia", "Yanhua Yu"], 442 "year": 2024, 443 "relevance": "Graph-based RAG baseline that enhances efficiency via graph indexing and updates." 444 }, 445 { 446 "title": "PathRAG: Pruning graph-based retrieval augmented generation with relational paths", 447 "authors": ["Boyu Chen", "Zirui Guo", "Zidan Yang"], 448 "year": 2025, 449 "relevance": "Graph-based RAG method using path pruning for retrieval, serving as a contemporary baseline." 450 }, 451 { 452 "title": "From RAG to memory: Non-parametric continual learning for large language models", 453 "authors": ["Bernal Jiménez Gutiérrez", "Yiheng Shu", "Weijian Qi"], 454 "year": 2025, 455 "relevance": "HippoRAG2 method using Personalized PageRank for graph-based RAG retrieval; strongest baseline." 
456 }, 457 { 458 "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks", 459 "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"], 460 "year": 2020, 461 "relevance": "Foundational RAG paper establishing the retrieval-augmented generation paradigm for knowledge-intensive tasks." 462 }, 463 { 464 "title": "GPT-4 technical report", 465 "authors": ["OpenAI"], 466 "year": 2024, 467 "relevance": "Technical report for the GPT-4 model family, of which GPT-4o-mini (used in this paper) is a variant." 468 }, 469 { 470 "title": "Retrieval-augmented generation for large language models: A survey", 471 "authors": ["Yunfan Gao", "Yun Xiong", "Xinyu Gao"], 472 "year": 2024, 473 "relevance": "Comprehensive survey of RAG methods for LLMs, providing context for the RAG landscape." 474 }, 475 { 476 "title": "FlashRAG: A modular toolkit for efficient retrieval-augmented generation research", 477 "authors": ["Jiajie Jin", "Yutao Zhu", "Xinyu Yang"], 478 "year": 2024, 479 "relevance": "RAG evaluation toolkit whose F1 metric methodology is adopted in this paper." 480 }, 481 { 482 "title": "RAGAs: Automated evaluation of retrieval augmented generation", 483 "authors": ["Shahul Es", "Jithin James", "Luis Espinosa Anke"], 484 "year": 2024, 485 "relevance": "RAG evaluation framework whose retrieval similarity metric inspired the R-S metric used in this paper." 486 }, 487 { 488 "title": "Unifying large language models and knowledge graphs: A roadmap", 489 "authors": ["Shirui Pan", "Linhao Luo", "Yufei Wang"], 490 "year": 2024, 491 "relevance": "Surveys the integration of LLMs and knowledge graphs, providing context for graph-enhanced LLM generation." 492 }, 493 { 494 "title": "A survey of large language models", 495 "authors": ["Wayne Xin Zhao", "Kun Zhou", "Junyi Li"], 496 "year": 2024, 497 "relevance": "Comprehensive survey of LLMs providing context for model capabilities used in RAG systems." 
498 }, 499 { 500 "title": "MemoRAG: Moving towards next-gen RAG via memory-inspired knowledge discovery", 501 "authors": ["Hongjin Qian", "Peitian Zhang", "Zheng Liu"], 502 "year": 2024, 503 "relevance": "Source of the UltraDomain benchmark dataset used for evaluation in this paper." 504 } 505 ], 506 "engagement_factors": { 507 "practical_relevance": { 508 "score": 2, 509 "justification": "RAG is widely deployed in practice, and hypergraph-based knowledge representation could improve knowledge-intensive applications, with code publicly available." 510 }, 511 "surprise_contrarian": { 512 "score": 1, 513 "justification": "Challenges the binary-graph assumption in GraphRAG by proposing hypergraphs, but the idea that richer representations improve retrieval is not deeply surprising." 514 }, 515 "fear_safety": { 516 "score": 0, 517 "justification": "No safety, security, or AI risk concerns are raised by this work." 518 }, 519 "drama_conflict": { 520 "score": 0, 521 "justification": "No controversy or dramatic claims; a straightforward methodological contribution." 522 }, 523 "demo_ability": { 524 "score": 2, 525 "justification": "Code and data are publicly available on GitHub, allowing practitioners to try the approach." 526 }, 527 "brand_recognition": { 528 "score": 1, 529 "justification": "Published at NeurIPS 2025 (prestigious venue) but from university labs without major brand recognition." 530 } 531 } 532 }