scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28906B)
      1 {
      2   "paper": {
      3     "title": "HyperGraphRAG: Retrieval-Augmented Generation via Hypergraph-Structured Knowledge Representation",
      4     "authors": [
      5       "Haoran Luo",
      6       "Haihong E",
      7       "Guanting Chen",
      8       "Yandan Zheng",
      9       "Xiaobao Wu",
     10       "Yikai Guo",
     11       "Qika Lin",
     12       "Yu Feng",
     13       "Zemin Kuang",
     14       "Meina Song",
     15       "Yifan Zhu",
     16       "Luu Anh Tuan"
     17     ],
     18     "year": 2025,
     19     "venue": "NeurIPS 2025",
     20     "arxiv_id": "2503.21322",
     21     "doi": "10.48550/arXiv.2503.21322"
     22   },
     23   "scan_version": 3,
     24   "active_modules": ["experimental_rigor", "data_leakage"],
     25   "methodology_tags": ["benchmark-eval", "theoretical"],
     26   "key_findings": "HyperGraphRAG proposes a hypergraph-based RAG framework that represents n-ary relational facts via hyperedges rather than binary edges. Across five domains (medicine, agriculture, CS, legal, mix), it outperforms six baselines including GraphRAG, LightRAG, PathRAG, and HippoRAG2 on F1, retrieval similarity, and generation evaluation metrics. Ablation shows entity retrieval, hyperedge retrieval, and chunk retrieval fusion each contribute meaningfully. The paper includes information-theoretic proofs that hypergraph representation preserves more information than binary graph representation for n-ary facts.",
     27   "checklist": {
     28     "artifacts": {
     29       "code_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The paper states 'Our data and code are publicly available' with footnote linking to https://github.com/LHRLAB/HyperGraphRAG."
     33       },
     34       "data_released": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "The paper states 'Our data and code are publicly available.' They also use publicly available datasets from UltraDomain and public hypertension guidelines."
     38       },
     39       "environment_specified": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper mentions '80-core CPU and 512GB RAM' and specifies the models used (GPT-4o-mini, text-embedding-3-small) but provides no requirements.txt, Dockerfile, or detailed dependency listing."
     43       },
     44       "reproduction_instructions": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but the paper itself does not include a 'Reproducing Results' section or runnable commands."
     48       }
     49     },
     50     "statistical_methodology": {
     51       "confidence_intervals_or_error_bars": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Table 2 and all other results tables report only point estimates with no confidence intervals, error bars, or ± notation."
     55       },
     56       "significance_tests": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The paper claims 'HyperGraphRAG consistently outperforms all baselines' based solely on comparing numerical values across Table 2 without any statistical significance tests."
     60       },
     61       "effect_sizes_reported": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Section 5.2 reports absolute improvements with baseline context: 'Compared to StandardRAG, it achieves gains of +7.45 (F1), +7.62 (R-S), and +3.69 (G-E).' Per-source-type improvements are also quantified."
     65       },
     66       "sample_size_justified": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The evaluation uses 512 questions per domain (256 binary + 256 n-ary, Appendix D) but no justification is given for why this number is sufficient."
     70       },
     71       "variance_reported": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No variance, standard deviation, or spread measures are reported. All results appear to be from single runs with no indication of result stability."
     75       }
     76     },
     77     "evaluation_design": {
     78       "baselines_included": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Six baselines are compared: NaiveGeneration, StandardRAG, GraphRAG, LightRAG, PathRAG, and HippoRAG2 (Table 2)."
     82       },
     83       "baselines_contemporary": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Baselines include LightRAG (2024), PathRAG (2025), and HippoRAG2 (2025), which are all recent and represent state-of-the-art graph-based RAG methods."
     87       },
     88       "ablation_study": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Figure 4 presents an ablation study removing entity retrieval (ER), hyperedge retrieval (HR), chunk retrieval fusion (CR), and their combinations."
     92       },
     93       "multiple_metrics": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Three metrics are used: F1 (word-level accuracy), R-S (retrieval similarity), and G-E (generation evaluation combining 7 LLM-judged dimensions with F1)."
     97       },
     98       "human_evaluation": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "Evaluation of system outputs is entirely automated. G-E uses GPT-4o-mini as LLM judge. Human annotators verified the ground-truth question-answer pairs (Appendix D), but did not evaluate the system's outputs."
    102       },
    103       "held_out_test_set": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The evaluation questions are constructed independently via sampling from the knowledge graph and are used solely for testing, not for any tuning or development decisions."
    107       },
    108       "per_category_breakdown": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Table 2 provides results broken down by domain (Medicine, Agriculture, CS, Legal, Mix) and by source type (Binary Source, N-ary Source, Overall)."
    112       },
    113       "failure_cases_discussed": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The paper does not discuss failure cases or error analysis. The case study in Table 5 (Appendix H) shows only a success scenario where HyperGraphRAG gets the correct answer while all baselines fail."
    117       },
    118       "negative_results_reported": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "Every experiment shows HyperGraphRAG outperforming all baselines. No approaches that were tried and failed are reported. The ablation results show expected degradation when removing components, which is not the same as reporting negative results."
    122       }
    123     },
    124     "claims_and_evidence": {
    125       "abstract_claims_supported": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The abstract claims HyperGraphRAG 'outperforms both standard RAG and previous graph-based RAG methods in answer accuracy, retrieval efficiency, and generation quality.' Table 2 supports all three claims across all domains."
    129       },
    130       "causal_claims_justified": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Causal claims are made through ablation studies (Figure 4, 'removing entity retrieval drops F1 from 35.4 to 29.8'). The ablation design involves controlled single-variable removal, which is adequate for the claims made."
    134       },
    135       "generalization_bounded": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The abstract and conclusion make broad claims about 'knowledge-intensive applications' and 'real-world applications' generally, while results are limited to 5 specific domains using a single LLM (GPT-4o-mini) and a single embedding model. The title does not bound to the tested setting."
    139       },
    140       "alternative_explanations_discussed": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No alternative explanations for the results are discussed. The paper does not consider confounds such as the possibility that hypergraph extraction simply produces more context tokens (and thus more information) regardless of structure."
    144       },
    145       "proxy_outcome_distinction": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "F1 (word overlap) is used as a proxy for 'answer accuracy,' R-S (embedding similarity) for 'retrieval efficiency,' and G-E (LLM judge) for 'generation quality.' The paper does not discuss the gap between these proxies and the constructs they claim to measure, particularly the validity of using GPT-4o-mini to judge outputs generated by GPT-4o-mini."
    149       }
    150     },
    151     "setup_transparency": {
    152       "model_versions_specified": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "The paper specifies 'GPT-4o-mini' and 'text-embedding-3-small' but provides no snapshot dates or API versions. 'GPT-4o-mini' is a product name rather than a pinned snapshot, so the underlying model's behavior can change across versions."
    156       },
    157       "prompts_provided": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Full prompt text is provided in Appendix A: n-ary relation extraction prompt (Figure 8), entity extraction prompt (Figure 9), and generation prompt (Figure 10). G-E evaluation prompts are also given (Figures 11-12)."
    161       },
    162       "hyperparameters_reported": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 5.1 and Table 4 report retrieval parameters (k_V=60, τ_V=50, k_H=60, τ_H=5, k_C=5, τ_C=0.5), temperature 1.0, max generation length 32k tokens, and 16 parallel cores."
    166       },
    167       "scaffolding_described": {
    168         "applies": false,
    169         "answer": false,
    170         "justification": "HyperGraphRAG is a retrieval pipeline (extract → store → retrieve → generate), not an agentic scaffold with loops, tool use, or retry logic."
    171       },
    172       "data_preprocessing_documented": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Appendix D documents the dataset construction: domain sources, question sampling strategy (1/2/3-hop traversal with specific counts), binary vs n-ary split, GPT-based question generation, and manual verification."
    176       }
    177     },
    178     "limitations_and_scope": {
    179       "limitations_section_present": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Appendix I provides a 'Limitations and Future Work' section with five subsections covering multimodal extension, RL integration, federated learning, foundation model development, and scaling to harder tasks."
    183       },
    184       "threats_to_validity_specific": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "The limitations section (Appendix I) discusses only future work directions (multimodal, RL, federated, foundation model, scaling). No specific threats to the current study's validity are discussed, such as the single-model evaluation, benchmark generation bias, or LLM-as-judge circularity."
    188       },
    189       "scope_boundaries_stated": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The paper does not explicitly state what the results do NOT show. The conclusion broadly claims 'effectiveness and generalizability' without bounding to the specific five domains and single LLM tested."
    193       }
    194     },
    195     "data_integrity": {
    196       "raw_data_available": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The paper claims 'Our data and code are publicly available' at the GitHub repository, and uses publicly available UltraDomain datasets and published hypertension guidelines."
    200       },
    201       "data_collection_described": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Appendix D describes domain sources (UltraDomain for 4 domains, ESC hypertension guidelines for medicine), question sampling via multi-hop traversal, and human verification process."
    205       },
    206       "recruitment_methods_described": {
    207         "applies": false,
    208         "answer": false,
    209         "justification": "No human participants in the evaluation. Data comes from standard public datasets and published guidelines."
    210       },
    211       "data_pipeline_documented": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Appendix D documents the full pipeline: domain selection → knowledge fragment sampling (1/2/3 hop) → question generation via GPT → human verification. Exact counts per hop level are provided (128 + 64 + 64 per source type)."
    215       }
    216     },
    217     "conflicts_of_interest": {
    218       "funding_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Acknowledgments section discloses funding from the National Natural Science Foundation of China (Grants 62473271, 62176026, 62406036) and the Engineering Research Center of Information Networks, Ministry of Education, China."
    222       },
    223       "affiliations_disclosed": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "All author affiliations are listed: BUPT, NTU, Beijing Institute of Computer Technology, NUS, China Mobile Research Institute, Beijing Anzhen Hospital. No product from the authors' institutions is being evaluated."
    227       },
    228       "funder_independent_of_outcome": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "Funding is from NSFC (Chinese government science foundation) and Ministry of Education, which have no financial stake in whether HyperGraphRAG outperforms baselines."
    232       },
    233       "financial_interests_declared": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No competing interests or financial interests statement is present in the paper."
    237       }
    238     },
    239     "contamination": {
    240       "training_cutoff_stated": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "GPT-4o-mini is used for both extraction and generation, but no training data cutoff date is stated. The UltraDomain and hypertension guidelines could be in GPT-4o-mini's training data."
    244       },
    245       "train_test_overlap_discussed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "No discussion of whether GPT-4o-mini may have seen UltraDomain content or the ESC hypertension guidelines during training, which could affect absolute performance numbers."
    249       },
    250       "benchmark_contamination_addressed": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "UltraDomain is publicly available and could be in GPT-4o-mini's training data. The questions are newly generated but from potentially contaminated source material. This is not discussed."
    254       }
    255     },
    256     "human_studies": {
    257       "pre_registered": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "irb_or_ethics_approval": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "demographics_reported": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "inclusion_exclusion_criteria": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "randomization_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "blinding_described": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       },
    287       "attrition_reported": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "No human participants in this study."
    291       }
    292     },
    293     "cost_and_practicality": {
    294       "inference_cost_reported": {
    295         "applies": true,
    296         "answer": true,
    297         "justification": "Table 3 reports time per 1k tokens (TP1kT), cost per 1k tokens (CP1kT), time per query (TPQ=0.256s), and cost per 1k queries (CP1kQ=$3.184) for all methods."
    298       },
    299       "compute_budget_stated": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "Hardware is mentioned (80-core CPU, 512GB RAM) and per-unit costs are in Table 3, but total computational budget (total API spend, total experiment time across all domains and baselines) is not stated."
    303       }
    304     },
    305     "experimental_rigor": {
    306       "seed_sensitivity_reported": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "No mention of random seeds or results across multiple runs. All reported numbers appear to be single-run results."
    310       },
    311       "number_of_runs_stated": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The number of experimental runs is never stated. Results are presented without indicating how many runs produced them."
    315       },
    316       "hyperparameter_search_budget": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "Hyperparameter values (k=60, thresholds, etc.) are stated, but no search budget, search method, or number of configurations tried is reported. Figure 6(a) shows the impact of top-k but does not constitute a systematic search."
    320       },
    321       "best_config_selection_justified": {
    322         "applies": true,
    323         "answer": true,
    324         "justification": "Figure 6(a) shows how performance varies with top-k, with the paper noting 'Performance saturates around k = 60,' providing empirical justification for the chosen configuration."
    325       },
    326       "multiple_comparison_correction": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The paper makes many pairwise comparisons across 7 methods × 5 domains × 3 metrics without any statistical tests, let alone multiple comparison corrections."
    330       },
    331       "self_comparison_bias_addressed": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The authors compare their system against baselines that they ran themselves (using publicly available code) without acknowledging the bias documented by Lucic et al. (2018), whereby baselines run by a method's own authors systematically underperform."
    335       },
    336       "compute_budget_vs_performance": {
    337         "applies": true,
    338         "answer": true,
    339         "justification": "Table 3 explicitly compares construction time/cost and generation time/cost across all methods alongside their performance, and Section 5.7 discusses the trade-offs."
    340       },
    341       "benchmark_construct_validity": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "The paper does not discuss whether F1 (word overlap), R-S (embedding similarity), or G-E (LLM judge) actually measure 'answer accuracy,' 'retrieval efficiency,' and 'generation quality.' The validity of using GPT-4o-mini to judge its own outputs is not questioned."
    345       },
    346       "scaffold_confound_addressed": {
    347         "applies": true,
    348         "answer": true,
    349         "justification": "All methods use the same generation model (GPT-4o-mini) and the same unified generation prompt (Figure 10, Appendix E). Table 4 confirms identical generation settings. The comparison isolates the retrieval/knowledge representation strategy."
    350       }
    351     },
    352     "data_leakage": {
    353       "temporal_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether GPT-4o-mini's training data includes the UltraDomain corpus or the 2024 ESC hypertension guidelines, which would constitute temporal leakage."
    357       },
    358       "feature_leakage_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "The questions are generated from sampled knowledge fragments. No discussion of whether the question generation process or retrieval setup leaks answer information."
    362       },
    363       "non_independence_addressed": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No discussion of whether the constructed questions are independent of each other or whether overlapping knowledge fragments create dependencies between test examples."
    367       },
    368       "leakage_detection_method": {
    369         "applies": true,
    370         "answer": false,
    371         "justification": "No concrete leakage detection or prevention method is applied, such as canary strings, membership inference, or decontamination analysis."
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "HyperGraphRAG consistently outperforms all baselines across F1, R-S, and G-E metrics, with gains of +7.45 F1, +7.62 R-S, and +3.69 G-E over StandardRAG.",
    378       "evidence": "Table 2 shows overall results across 5 domains. HyperGraphRAG achieves the highest scores in all three metrics in all domains. Section 5.2 quantifies the gains.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Hypergraph-structured knowledge representation is more comprehensive than binary graph representation, preserving more information in the information-theoretic sense.",
    383       "evidence": "Proposition 1, with a formal proof in Appendix B.1 using Shannon entropy and mutual information. Figure 5(f) shows that HyperGraphRAG extracts more entities and hyperedges than GraphRAG and LightRAG extract entities and relations.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Each component (entity retrieval, hyperedge retrieval, chunk retrieval fusion) contributes meaningfully to HyperGraphRAG's performance.",
    388       "evidence": "Figure 4 ablation in Medicine domain: removing ER drops F1 from 35.4 to 29.8, removing HR drops to 26.4, removing CR drops to 29.2.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "HyperGraphRAG outperforms binary graph-based methods even under retrieval length constraints.",
    393       "evidence": "Figure 6(b) shows F1 comparison under limited retrieved knowledge lengths from 32 to 8192 tokens in the Medicine domain. HyperGraphRAG leads at most length budgets.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "HyperGraphRAG achieves a favorable trade-off between construction/generation efficiency and output quality.",
    398       "evidence": "Table 3 reports construction time of 3.084s/1kT and generation cost of $3.184/1kQ, positioned between the cheapest and most expensive baselines while achieving the best quality metrics.",
    399       "supported": "moderate"
    400     }
    401   ],
    402   "red_flags": [
    403     {
    404       "flag": "LLM-as-judge circularity",
    405       "detail": "The G-E metric uses GPT-4o-mini as judge to evaluate outputs generated by GPT-4o-mini. This creates a self-evaluation bias where the model may prefer its own generation style. The paper does not acknowledge or mitigate this."
    406     },
    407     {
    408       "flag": "No error bars or multiple runs",
    409       "detail": "All results across 5 domains × 7 methods × 3 metrics appear to be single-run point estimates with no variance, standard deviation, or confidence intervals reported. Result stability is unknown."
    410     },
    411     {
    412       "flag": "No significance tests despite strong claims",
    413       "detail": "Claims of consistent outperformance are based entirely on numerical comparisons without any statistical significance tests. Some F1 differences are small (e.g., 31.30 vs 28.93 in CS Overall) and may not be significant."
    414     },
    415     {
    416       "flag": "GPT-generated benchmark",
    417       "detail": "Evaluation questions and golden answers are generated by GPT and then verified by human annotators. This process may introduce systematic biases that favor methods using GPT-4o-mini for generation, as the golden answers may reflect GPT's style."
    418     },
    419     {
    420       "flag": "Ablation limited to one domain",
    421       "detail": "The ablation study (Figure 4) is conducted only in the Medicine domain. Component contributions may differ across domains, but this is not verified."
    422     },
    423     {
    424       "flag": "Existing graph-based baselines underperform StandardRAG",
    425       "detail": "The paper notes that 'existing graph-based RAG baselines often underperform StandardRAG' (Section 5.2), which raises questions about implementation fairness — if the baselines are not performing at expected levels, the comparison may be skewed."
    426     }
    427   ],
    428   "cited_papers": [
    429     {
    430       "title": "From local to global: A graph rag approach to query-focused summarization",
    431       "authors": ["Darren Edge", "Ha Trinh", "Newman Cheng", "Joshua Bradley", "Alex Chao", "Apurva Mody", "Steven Truitt", "Jonathan Larson"],
    432       "year": 2024,
    433       "relevance": "Foundational graph-based RAG method that structures knowledge as community summaries, serving as a key baseline."
    434     },
    435     {
    436       "title": "LightRAG: Simple and fast retrieval-augmented generation",
    437       "authors": ["Zirui Guo", "Lianghao Xia", "Yanhua Yu", "Tu Ao", "Chao Huang"],
    438       "year": 2024,
    439       "relevance": "Efficient graph-based RAG using graph indexing; key baseline and source of evaluation datasets."
    440     },
    441     {
    442       "title": "PathRAG: Pruning graph-based retrieval augmented generation with relational paths",
    443       "authors": ["Boyu Chen", "Zirui Guo", "Zidan Yang"],
    444       "year": 2025,
    445       "relevance": "Graph-based RAG using path pruning for retrieval; contemporary baseline comparison."
    446     },
    447     {
    448       "title": "From RAG to memory: Non-parametric continual learning for large language models",
    449       "authors": ["Bernal Jiménez Gutiérrez", "Yiheng Shu", "Weijian Qi", "Sizhe Zhou", "Yu Su"],
    450       "year": 2025,
    451       "relevance": "HippoRAG2 extends graph-based RAG with Personalized PageRank retrieval; contemporary baseline."
    452     },
    453     {
    454       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    455       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"],
    456       "year": 2020,
    457       "relevance": "Seminal RAG paper establishing the retrieval-augmented generation paradigm."
    458     },
    459     {
    460       "title": "FlashRAG: A modular toolkit for efficient retrieval-augmented generation research",
    461       "authors": ["Jiajie Jin", "Yutao Zhu", "Xinyu Yang", "Chenghao Zhang", "Zhicheng Dou"],
    462       "year": 2024,
    463       "relevance": "RAG evaluation toolkit providing the F1 metric methodology used in this paper."
    464     },
    465     {
    466       "title": "RAGAs: Automated evaluation of retrieval augmented generation",
    467       "authors": ["Shahul Es", "Jithin James", "Luis Espinosa Anke", "Steven Schockaert"],
    468       "year": 2024,
    469       "relevance": "Provides the retrieval similarity metric methodology adapted for evaluation in this paper."
    470     },
    471     {
    472       "title": "HelloBench: Evaluating long text generation capabilities of large language models",
    473       "authors": ["Haoran Que", "Feiyu Duan", "Liqun He"],
    474       "year": 2024,
    475       "relevance": "Source of the generation evaluation (G-E) metric using LLM-as-judge across 7 dimensions."
    476     },
    477     {
    478       "title": "MiniRAG: Towards extremely simple retrieval-augmented generation",
    479       "authors": ["Tianyu Fan", "Jingyuan Wang", "Xubin Ren", "Chao Huang"],
    480       "year": 2025,
    481       "relevance": "Contemporary simple RAG approach extending graph-based retrieval methods."
    482     },
    483     {
    484       "title": "KAG: Boosting LLMs in professional domains via knowledge augmented generation",
    485       "authors": ["Lei Liang", "Mengshu Sun", "Zhengke Gui"],
    486       "year": 2024,
    487       "relevance": "Knowledge augmented generation method for professional domains, related approach to domain-specific RAG."
    488     },
    489     {
    490       "title": "MemoRAG: Moving towards next-gen RAG via memory-inspired knowledge discovery",
    491       "authors": ["Hongjin Qian", "Peitian Zhang", "Zheng Liu", "Kelong Mao", "Zhicheng Dou"],
    492       "year": 2024,
    493       "relevance": "Source of the UltraDomain dataset used for evaluation across 4 of the 5 experimental domains."
    494     },
    495     {
    496       "title": "Unifying large language models and knowledge graphs: A roadmap",
    497       "authors": ["Shirui Pan", "Linhao Luo", "Yufei Wang"],
    498       "year": 2024,
    499       "relevance": "Survey on integrating LLMs with knowledge graphs, providing context for graph-based RAG approaches."
    500     }
    501   ],
    502   "engagement_factors": {
    503     "practical_relevance": {
    504       "score": 2,
    505       "justification": "RAG is a widely used pattern and the hypergraph extension is directly applicable, with code released on GitHub."
    506     },
    507     "surprise_contrarian": {
    508       "score": 1,
    509       "justification": "Extending binary graphs to hypergraphs is a natural progression rather than a surprising challenge to conventional wisdom."
    510     },
    511     "fear_safety": {
    512       "score": 0,
    513       "justification": "No safety or security concerns raised by the work."
    514     },
    515     "drama_conflict": {
    516       "score": 0,
    517       "justification": "No controversy; a straightforward technical contribution."
    518     },
    519     "demo_ability": {
    520       "score": 2,
    521       "justification": "Code is publicly available on GitHub, so practitioners could try it, though it requires API keys and setup."
    522     },
    523     "brand_recognition": {
    524       "score": 1,
    525       "justification": "Published at NeurIPS (prestigious venue) but from relatively unknown research groups (BUPT, NTU)."
    526     }
    527   }
    528 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs