scan.json (25573B)
1 { 2 "paper": { 3 "title": "Beyond Chunks and Graphs: Retrieval-Augmented Generation through Triplet-Driven Thinking", 4 "authors": ["Shengbo Gong", "Xianfeng Tang", "Carl Yang", "Wei Jin"], 5 "year": 2025, 6 "arxiv_id": "2508.02435" 7 }, 8 "checklist": { 9 "artifacts": { 10 "code_released": { 11 "applies": true, 12 "answer": true, 13 "justification": "The abstract states 'Our code is available at https://github.com/rockcor/T2RAG.' This is a working URL provided in the paper." 14 }, 15 "data_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "The paper uses publicly available benchmark datasets: PopQA, 2Wiki-MultihopQA, MuSiQue, HotpotQA, and datasets from GraphRAG-Bench. These are standard public benchmarks. The paper also follows the same 1,000-question samples from prior work (HippoRAG2)." 19 }, 20 "environment_specified": { 21 "applies": true, 22 "answer": false, 23 "justification": "The paper mentions 'a single NVIDIA L40S GPU' for embedding and 'temperature to 0' but does not provide a requirements.txt, Dockerfile, or detailed list of library versions. This is not enough to recreate the environment." 24 }, 25 "reproduction_instructions": { 26 "applies": true, 27 "answer": false, 28 "justification": "While code is released on GitHub, the paper itself does not contain step-by-step reproduction instructions, a README with commands, or a 'Reproducing Results' section. The pseudocode (Algorithms 1-2) describes the method but not how to run the experiments." 29 } 30 }, 31 "statistical_methodology": { 32 "confidence_intervals_or_error_bars": { 33 "applies": true, 34 "answer": false, 35 "justification": "Tables 1 and 2 report only point estimates (e.g., '56.6 / 62.4') with no confidence intervals, error bars, or ± notation." 
36 }, 37 "significance_tests": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper claims T2RAG 'significantly outperforms' baselines and 'outperforms the multi-round baseline, IRCoT, by over 7.7%' but provides no statistical significance tests (no p-values, t-tests, or bootstrap tests). Comparative claims are based solely on comparing point estimates." 41 }, 42 "effect_sizes_reported": { 43 "applies": true, 44 "answer": true, 45 "justification": "The paper reports percentage improvements with baseline context throughout. For example, 'an average performance gain of up to 11%', 'reducing retrieval costs by up to 45%', and 'outperforms the multi-round baseline, IRCoT, by over 7.7% and 5.4% in EM.' Ablation results show relative drops (e.g., '↓54.5%'). The raw scores and baselines are provided in Table 1." 46 }, 47 "sample_size_justified": { 48 "applies": true, 49 "answer": false, 50 "justification": "The paper uses 1,000 questions per dataset for most benchmarks and 794/564 for domain-specific sets, following prior work conventions, but does not justify why these sample sizes are adequate or discuss statistical power." 51 }, 52 "variance_reported": { 53 "applies": true, 54 "answer": false, 55 "justification": "No standard deviations, variance, or spread measures are reported across runs. The paper sets temperature to 0 for deterministic output but does not discuss variance across different random seeds, embedding initializations, or other sources of non-determinism." 56 } 57 }, 58 "evaluation_design": { 59 "baselines_included": { 60 "applies": true, 61 "answer": true, 62 "justification": "The paper compares against multiple baselines: NOR (no retrieval), BM25, Standard RAG, HippoRAG2 (Graph RAG), RAPTOR (summarization-based), and IRCoT (multi-round RAG), as shown in Table 1." 
63 }, 64 "baselines_contemporary": { 65 "applies": true, 66 "answer": true, 67 "justification": "The baselines are contemporary and competitive: HippoRAG2 (2025), RAPTOR (2024), IRCoT (2023), LightRAG (2024). These represent recent state-of-the-art methods across different RAG paradigms." 68 }, 69 "ablation_study": { 70 "applies": true, 71 "answer": true, 72 "justification": "Table 2 presents an ablation study with two variants: '- single round' (removing iterative resolution) and '- w/o chunk' (removing raw chunk text), showing the contribution of each component on three datasets." 73 }, 74 "multiple_metrics": { 75 "applies": true, 76 "answer": true, 77 "justification": "The paper reports two metrics: Exact Match (EM) and F1 scores, as shown in Table 1. Additionally, computational efficiency is evaluated via token consumption and wall-clock time." 78 }, 79 "human_evaluation": { 80 "applies": true, 81 "answer": false, 82 "justification": "No human evaluation is included. All evaluation is automated using EM and F1 scores against ground-truth answers. For a QA system, human evaluation of answer quality or relevance could strengthen the claims." 83 }, 84 "held_out_test_set": { 85 "applies": true, 86 "answer": true, 87 "justification": "The paper uses established benchmark test sets (PopQA, 2Wiki, MuSiQue, HotpotQA, GraphRAG-Bench). The datasets come with predefined test splits, and the paper follows the same 1,000-question samples used in prior work." 88 }, 89 "per_category_breakdown": { 90 "applies": true, 91 "answer": true, 92 "justification": "Table 1 provides per-dataset breakdowns across three categories (Simple QA, Multi-hop QA, Domain-Specific QA) with six individual datasets. Figure 3 breaks down performance by resolution status (resolved vs. unresolved)." 93 }, 94 "failure_cases_discussed": { 95 "applies": true, 96 "answer": true, 97 "justification": "Figure 3 analyzes failure cases by showing performance when triplets are not fully resolved. 
The Limitations section discusses when the method fails: 'simple triplets may not adequately represent complex knowledge like many-to-many relationships.' The paper also notes T2RAG's lower performance on MuSiQue compared to some baselines." 98 }, 99 "negative_results_reported": { 100 "applies": true, 101 "answer": true, 102 "justification": "The paper reports that T2RAG does not always win: on MuSiQue with GPT-4o-mini, T2RAG (34.3/45.6) trails HippoRAG2 (34.1/48.1) in F1 and is comparable to IRCoT. With GPT-4o-mini, T2RAG's F1 average (60.2) is second to HippoRAG2 (61.1), ahead of IRCoT (58.8). The ablation also shows negative impact of removing components." 103 } 104 }, 105 "claims_and_evidence": { 106 "abstract_claims_supported": { 107 "applies": true, 108 "answer": true, 109 "justification": "The abstract claims 'an average performance gain of up to 11% across six datasets while reducing retrieval costs by up to 45%.' Table 1 shows T2RAG leading in average EM/F1, and Figure 4 shows token consumption reductions. The 'up to' hedging is supported by the data, though the average gain varies by LLM backbone." 110 }, 111 "causal_claims_justified": { 112 "applies": true, 113 "answer": true, 114 "justification": "The paper makes causal claims through ablation studies (Table 2): removing the iterative module or chunk text causes performance drops. These are controlled single-variable manipulations that adequately support the causal claims about component contributions." 115 }, 116 "generalization_bounded": { 117 "applies": true, 118 "answer": false, 119 "justification": "The title and abstract claim the method moves 'Beyond Chunks and Graphs' for RAG generally, but results are limited to factoid QA tasks with exact-match evaluation on six datasets. The Limitations section acknowledges 'we limited our multi-round methods to 3 iterations' and did not test other embedding models, but the broad framing of the title and abstract exceeds the tested scope."
120 }, 121 "alternative_explanations_discussed": { 122 "applies": true, 123 "answer": false, 124 "justification": "The paper does not substantively discuss alternative explanations for its results. For example, it does not consider whether the improvements come from the triplet extraction LLM call (additional compute) rather than the triplet representation itself, or whether the answer format standardization (mentioned in Appendix B.1) could explain some of the baseline improvements." 125 } 126 }, 127 "setup_transparency": { 128 "model_versions_specified": { 129 "applies": true, 130 "answer": false, 131 "justification": "The paper uses 'Gemini-2.5-flash' and 'GPT-4o-mini' without specifying snapshot dates or API versions. 'NV-Embed-v2' is named for embeddings. Marketing names without snapshot dates do not count as specified versions per the schema criteria." 132 }, 133 "prompts_provided": { 134 "applies": true, 135 "answer": true, 136 "justification": "Appendix E provides the full prompt templates for all three LLM stages: Structured Query Decomposition, Triplets Resolving, and Final Answering. The prompts include examples and exact instruction text with placeholder notation for variable content." 137 }, 138 "hyperparameters_reported": { 139 "applies": true, 140 "answer": true, 141 "justification": "Section 5.2 and Appendix B.1 report key hyperparameters: temperature=0, top-k=5 for chunk retrieval, N=3 maximum iterations, chunk size=1200 tokens with 100-token overlap. RAPTOR cluster size=10, level=3. HippoRAG2 uses default parameters." 142 }, 143 "scaffolding_described": { 144 "applies": true, 145 "answer": true, 146 "justification": "The paper describes the agentic scaffolding in detail: the three-step pipeline (decomposition, iterative resolution, final answering), the stopping condition, the state update mechanism, and the adaptive retrieval strategy. Algorithms 1 and 2 provide pseudocode. Figure 2 shows the workflow." 
147 }, 148 "data_preprocessing_documented": { 149 "applies": true, 150 "answer": true, 151 "justification": "Section 5.1 describes dataset selection and adaptation: following the HippoRAG2 setup for the first four datasets, using the same 1,000-question samples. For domain-specific datasets, they 'isolate the factoid questions' from GraphRAG-Bench and 'use an LLM to shorten the ground-truth answers.' Chunking strategy (1200 tokens, 100-token overlap) is documented. Table 3 provides detailed dataset statistics." 152 } 153 }, 154 "limitations_and_scope": { 155 "limitations_section_present": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section 7 is a dedicated 'Limitations' section that discusses experimental constraints (iteration limits, embedding model coverage, lack of recall metrics), methodological limitations (triplet extraction quality, many-to-many relationships), and scalability concerns." 159 }, 160 "threats_to_validity_specific": { 161 "applies": true, 162 "answer": true, 163 "justification": "The Limitations section discusses specific threats: 'we limited our multi-round methods to 3 iterations to match the complexity of the datasets', 'we did not have the resources to test on other embedding models especially LLM-based ones, re-rankers or large external knowledge graphs', and 'simple triplets may not adequately represent complex knowledge like many-to-many relationships.' These are specific to this study." 164 }, 165 "scope_boundaries_stated": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 2 explicitly states 'this paper focuses on advancing the state-of-the-art in factoid QA' and defines the exact task scope: 'The answer must be exact one entity in our setting, such as persons, organizations, or locations or yes/no.' The Limitations section adds further boundaries about untested models and embedding approaches." 
169 } 170 }, 171 "data_integrity": { 172 "raw_data_available": { 173 "applies": true, 174 "answer": true, 175 "justification": "The datasets used (PopQA, 2Wiki, MuSiQue, HotpotQA, GraphRAG-Bench) are all publicly available benchmarks. The code repository is provided. These together allow independent verification of the raw data and results pipeline." 176 }, 177 "data_collection_described": { 178 "applies": true, 179 "answer": true, 180 "justification": "Section 5.1 describes the data selection: datasets were chosen for three ODQA categories, following the HippoRAG2 experimental setup for the first four. For domain-specific evaluation, factoid questions were isolated from GraphRAG-Bench with LLM-shortened answers. Table 3 provides detailed statistics." 181 }, 182 "recruitment_methods_described": { 183 "applies": false, 184 "answer": false, 185 "justification": "No human participants. The data source is standard public benchmarks." 186 }, 187 "data_pipeline_documented": { 188 "applies": true, 189 "answer": true, 190 "justification": "The offline indexing pipeline is documented: corpus → chunking (1200 tokens, 100-token overlap) → triplet extraction via LLM → verbalization → embedding → FAISS indexing. The online retrieval pipeline is documented via Algorithms 1-2 and Section 4.3. Dataset adaptation steps are described in Section 5.1." 191 } 192 }, 193 "conflicts_of_interest": { 194 "funding_disclosed": { 195 "applies": true, 196 "answer": false, 197 "justification": "No funding source or acknowledgments section is present in the paper. There is no mention of grants, corporate sponsorship, or funding agencies." 198 }, 199 "affiliations_disclosed": { 200 "applies": true, 201 "answer": true, 202 "justification": "Author affiliations are clearly listed: three authors from Emory University and one from Amazon. This is disclosed prominently at the top of the paper." 
203 }, 204 "funder_independent_of_outcome": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding information is disclosed, so independence cannot be assessed. One author is from Amazon, which has commercial interest in RAG systems. Without a funding disclosure, this cannot be evaluated as YES." 208 }, 209 "financial_interests_declared": { 210 "applies": true, 211 "answer": false, 212 "justification": "No competing interests statement or financial interest disclosure is present in the paper. One author is affiliated with Amazon, which has commercial interest in LLM and RAG technologies." 213 } 214 }, 215 "contamination": { 216 "training_cutoff_stated": { 217 "applies": true, 218 "answer": false, 219 "justification": "The paper evaluates GPT-4o-mini and Gemini-2.5-flash on QA benchmarks but does not state the training data cutoff dates for either model. This is necessary to assess whether benchmark questions could appear in training data." 220 }, 221 "train_test_overlap_discussed": { 222 "applies": true, 223 "answer": false, 224 "justification": "The paper does not discuss whether the benchmark questions or their associated Wikipedia passages could have appeared in the training data of GPT-4o-mini or Gemini-2.5-flash. The NOR (no retrieval) baseline shows non-trivial performance (e.g., 32.4% EM on PopQA with Gemini), suggesting possible contamination, but this is not discussed." 225 }, 226 "benchmark_contamination_addressed": { 227 "applies": true, 228 "answer": false, 229 "justification": "The benchmarks used (PopQA 2023, HotpotQA 2018, 2Wiki 2020, MuSiQue 2022) were all published before the likely training cutoffs of GPT-4o-mini and Gemini-2.5-flash. The paper does not address this contamination risk, even though the NOR baseline results suggest the models may have parametric knowledge of the answers." 
230 } 231 }, 232 "human_studies": { 233 "pre_registered": { 234 "applies": false, 235 "answer": false, 236 "justification": "No human participants in this study." 237 }, 238 "irb_or_ethics_approval": { 239 "applies": false, 240 "answer": false, 241 "justification": "No human participants in this study." 242 }, 243 "demographics_reported": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this study." 247 }, 248 "inclusion_exclusion_criteria": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "randomization_described": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "blinding_described": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "attrition_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 } 268 }, 269 "cost_and_practicality": { 270 "inference_cost_reported": { 271 "applies": true, 272 "answer": true, 273 "justification": "Figures 4, 6, and 7 report token consumption (using a weighted formula: input + 4*output) and wall-clock time for both indexing and retrieval stages across all datasets. Section 5.3 (RQ4) provides detailed analysis of computational costs relative to baselines." 274 }, 275 "compute_budget_stated": { 276 "applies": true, 277 "answer": false, 278 "justification": "While token consumption and time are reported per-stage, the total computational budget (total API spend, total GPU hours for embedding) is not stated. The paper mentions 'a single NVIDIA L40S GPU' for embeddings but does not quantify the total compute used across all experiments." 
279 } 280 } 281 }, 282 "claims": [ 283 { 284 "claim": "T2RAG achieves state-of-the-art overall performance on factoid QA, leading in both average EM and F1 scores with Gemini-2.5-flash (51.7 EM, 63.9 F1).", 285 "evidence": "Table 1 shows T2RAG leading in average EM with both LLMs and in average F1 with Gemini-2.5-flash. With GPT-4o-mini, T2RAG leads in EM (47.2) but is second in F1 (60.2 vs HippoRAG2's 61.1).", 286 "supported": "moderate" 287 }, 288 { 289 "claim": "T2RAG outperforms IRCoT by over 7.7% in EM on 2Wiki with Gemini-2.5-flash.", 290 "evidence": "Table 1: T2RAG achieves 69.3 EM vs IRCoT's 61.6 on 2Wiki with Gemini-2.5-flash, a difference of 7.7 percentage points. However, no statistical significance tests are provided.", 291 "supported": "moderate" 292 }, 293 { 294 "claim": "T2RAG reduces retrieval costs by up to 45% compared to multi-round methods.", 295 "evidence": "Figures 4 and 7 show token consumption comparisons. Appendix B.2.2 states 'the reduction in token consumption is over 45%' on Medical and Story datasets. The comparison is specifically against IRCoT during the retrieval stage.", 296 "supported": "strong" 297 }, 298 { 299 "claim": "The iterative triplet resolution module is critical, with removal causing up to 54.5% EM drop on MuSiQue.", 300 "evidence": "Table 2 shows the ablation '- single round' causes EM to drop from 33.0 to 15.0 on MuSiQue (↓54.5%). The effect is smaller on PopQA (↓2.1%) and moderate on 2Wiki (↓22.7%).", 301 "supported": "strong" 302 }, 303 { 304 "claim": "T2RAG shows a powerful synergy with reasoning LLMs, performing significantly higher with Gemini-2.5-flash than GPT-4o-mini.", 305 "evidence": "Table 1 shows T2RAG's average EM is 51.7 with Gemini-2.5-flash vs 47.2 with GPT-4o-mini. 
However, other methods also generally perform better with Gemini, so the 'synergy' claim is not well-isolated from the general LLM capability difference.", 306 "supported": "weak" 307 } 308 ], 309 "methodology_tags": ["benchmark-eval"], 310 "key_findings": "T2RAG proposes a RAG framework that decomposes queries into atomic triplets with placeholders and iteratively resolves them via retrieval, bypassing explicit graph construction. On six factoid QA benchmarks with two LLMs, T2RAG achieves state-of-the-art average EM scores (51.7 with Gemini-2.5-flash, 47.2 with GPT-4o-mini) while reducing retrieval-stage token costs by up to 45% compared to multi-round baselines. Ablation studies confirm that both iterative resolution and raw chunk augmentation are critical components, with the multi-round mechanism contributing most on multi-hop questions.", 311 "red_flags": [ 312 { 313 "flag": "No statistical significance testing", 314 "detail": "All comparative claims (e.g., 'significantly outperforms') are based on point estimates without any significance tests. With 1,000 questions per dataset, variance across questions could make some differences non-significant." 315 }, 316 { 317 "flag": "No variance or error bars", 318 "detail": "Despite using temperature=0 for determinism, the paper reports single-run results with no assessment of variance from other sources (embedding model initialization, API version changes, data sampling). Reproducibility across runs is asserted but not demonstrated." 319 }, 320 { 321 "flag": "Benchmark contamination risk unaddressed", 322 "detail": "Most benchmarks (HotpotQA 2018, 2Wiki 2020, MuSiQue 2022, PopQA 2023) predate the training of GPT-4o-mini and Gemini-2.5-flash. The NOR baseline shows substantial parametric knowledge (e.g., 48.1% EM on 2Wiki with Gemini), yet contamination is never discussed." 
323 }, 324 { 325 "flag": "Amazon affiliation without conflict disclosure", 326 "detail": "One author is affiliated with Amazon, which has commercial interest in RAG systems. No funding, competing interests, or conflict of interest statement is provided." 327 }, 328 { 329 "flag": "Inconsistent leadership across settings", 330 "detail": "T2RAG does not consistently lead: with GPT-4o-mini, it ranks second in average F1 and underperforms on several individual datasets (e.g., HotpotQA EM 54.2 vs Standard RAG's 58.0). The 'state-of-the-art' framing emphasizes the Gemini-2.5-flash results while the GPT-4o-mini results are more mixed." 331 } 332 ], 333 "cited_papers": [ 334 { 335 "title": "From Local to Global: A Graph RAG Approach to Query-Focused Summarization", 336 "authors": ["Darren Edge", "Ha Trinh", "Newman Cheng"], 337 "year": 2024, 338 "arxiv_id": "2404.16130", 339 "relevance": "Foundational GraphRAG system that constructs knowledge graphs with LLMs for retrieval, key baseline paradigm in RAG evaluation." 340 }, 341 { 342 "title": "LightRAG: Simple and Fast Retrieval-Augmented Generation", 343 "authors": ["Zirui Guo", "Lianghao Xia", "Yanhua Yu"], 344 "year": 2024, 345 "arxiv_id": "2410.05779", 346 "relevance": "Dual-level graph+vector RAG system compared in efficiency analysis; represents state-of-the-art Graph RAG efficiency." 347 }, 348 { 349 "title": "From RAG to Memory: Non-Parametric Continual Learning for Large Language Models", 350 "authors": ["Bernal Jiménez Gutiérrez", "Yiheng Shu"], 351 "year": 2025, 352 "arxiv_id": "2502.14802", 353 "relevance": "HippoRAG2 system used as the primary Graph RAG baseline; represents state-of-the-art in graph-based retrieval for multi-hop QA." 
354 }, 355 { 356 "title": "Interleaving Retrieval with Chain-of-Thought Reasoning for Knowledge-Intensive Multi-Step Questions", 357 "authors": ["Harsh Trivedi", "Niranjan Balasubramanian"], 358 "year": 2023, 359 "relevance": "IRCoT method used as the primary multi-round RAG baseline; represents the standard for iterative retrieval with chain-of-thought reasoning." 360 }, 361 { 362 "title": "Raptor: Recursive abstractive processing for tree-organized retrieval", 363 "authors": ["Parth Sarthi", "Salman Abdullah"], 364 "year": 2024, 365 "relevance": "Summarization-based RAG baseline that constructs hierarchical summary trees; competitive alternative to graph-based approaches." 366 }, 367 { 368 "title": "React: Synergizing reasoning and acting in language models", 369 "authors": ["Shunyu Yao", "Jeffrey Zhao"], 370 "year": 2023, 371 "relevance": "Foundational agentic framework interleaving reasoning and action in LLMs, relevant to understanding multi-round RAG paradigms." 372 }, 373 { 374 "title": "MiniRAG: Towards Extremely Simple Retrieval-Augmented Generation", 375 "authors": ["Tianyu Fan", "Jingyuan Wang"], 376 "year": 2025, 377 "relevance": "Lightweight Graph RAG approach using heterogeneous graphs, represents resource-constrained RAG system design." 378 }, 379 { 380 "title": "In-depth analysis of graph-based rag in a unified framework", 381 "authors": ["Yingli Zhou", "Yaodong Su"], 382 "year": 2025, 383 "relevance": "Unified benchmark framework for graph-based RAG methods, used for efficiency comparisons (LightRAG, GraphRAG token costs)." 384 }, 385 { 386 "title": "GEAR: Graph-Enhanced Agent for Retrieval-Augmented Generation", 387 "authors": ["Zhili Shen", "Chenxin Diao"], 388 "year": 2024, 389 "relevance": "Alternative approach using triplet search with neighbor expansion for RAG, directly compared in the related work as having entity linking limitations." 
390 }, 391 { 392 "title": "Retrieval-augmented generation for large language models: A survey", 393 "authors": ["Yunfan Gao", "Yun Xiong"], 394 "year": 2023, 395 "arxiv_id": "2312.10997", 396 "relevance": "Comprehensive survey of RAG paradigms for LLMs, provides context for the RAG landscape this paper contributes to." 397 } 398 ] 399 }