scan.json (26468B)
1 { 2 "paper": { 3 "title": "Classifying and Addressing the Diversity of Errors in Retrieval-Augmented Generation Systems", 4 "authors": [ 5 "Kin Kwan Leung", 6 "Mouloud Belbahri", 7 "Yi Sui", 8 "Alex Labach", 9 "Xueying Zhang", 10 "Stephen Anthony Rose", 11 "Jesse C. Cresswell" 12 ], 13 "year": 2025, 14 "venue": "arXiv", 15 "arxiv_id": "2510.13975" 16 }, 17 "checklist": { 18 "artifacts": { 19 "code_released": { 20 "applies": true, 21 "answer": true, 22 "justification": "The paper provides a GitHub link at https://github.com/layer6ai-labs/rag-error-classification and states 'Code and data are available' in the abstract. The annotated dataset is also released via this repository." 23 }, 24 "data_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "The paper states that the annotated error dataset is released at the GitHub repository. The DragonBall and CLAPnq datasets used are public datasets (CC BY-NC-SA 4.0 and Apache 2.0 respectively). The curated 406-annotation dataset is shared." 28 }, 29 "environment_specified": { 30 "applies": true, 31 "answer": false, 32 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is mentioned in the paper. While specific model names are listed (e.g., gte-large-en-v1.5, rank-zephyr-7b-v1-full, Meta-Llama-3-8B-Instruct), there are no library versions or environment setup details." 33 }, 34 "reproduction_instructions": { 35 "applies": true, 36 "answer": false, 37 "justification": "The paper does not include step-by-step reproduction instructions. While a GitHub repository is referenced and hyperparameters are listed in Table 5 and the appendix, there is no explicit 'Reproducing Results' section or README-level guidance described in the paper itself." 
38 } 39 }, 40 "statistical_methodology": { 41 "confidence_intervals_or_error_bars": { 42 "applies": true, 43 "answer": false, 44 "justification": "The paper reports point estimates for agreement rates (92.9%, 57.8%, 40.3%) and error distributions without confidence intervals or error bars." 45 }, 46 "significance_tests": { 47 "applies": true, 48 "answer": true, 49 "justification": "The paper mentions 'both datasets exhibiting significant chi-squared test results' in Section C.3 when analyzing co-occurrence patterns of error types." 50 }, 51 "effect_sizes_reported": { 52 "applies": true, 53 "answer": true, 54 "justification": "The paper reports the improvement from the original pipeline (73.3% correct) to the improved pipeline (82.8% correct) in Section 5, and provides full error count breakdowns with absolute numbers and percentages in Tables 2-3, giving sufficient context for the magnitude of effects." 55 }, 56 "sample_size_justified": { 57 "applies": true, 58 "answer": false, 59 "justification": "No justification is given for why 406 examples were manually annotated out of 832 errors, nor is there a power analysis or discussion of whether this sample size is sufficient for the classification agreement analysis." 60 }, 61 "variance_reported": { 62 "applies": true, 63 "answer": false, 64 "justification": "The RAGEC system queries the LLM K=10 times per datapoint and reports mode frequency distributions (Table 4), but no standard deviation, IQR, or other spread measure is reported for the main results (agreement rates, classification accuracy)." 65 } 66 }, 67 "evaluation_design": { 68 "baselines_included": { 69 "applies": true, 70 "answer": true, 71 "justification": "Section B.4 describes two alternative auto-evaluation approaches: single-step error type classification (41.1%/31.1% agreement) and stage-sequential error type classification (47.5%/36.3% agreement), compared to RAGEC (57.8%/40.3%)." 
72 }, 73 "baselines_contemporary": { 74 "applies": true, 75 "answer": false, 76 "justification": "While RAGEC is compared against alternative internal approaches, it is not compared against existing RAG evaluation tools like RAGChecker, ARES, or RAGAs mentioned in the Related Work section. The baselines are only the authors' own design alternatives." 77 }, 78 "ablation_study": { 79 "applies": true, 80 "answer": true, 81 "justification": "The paper effectively performs ablation by testing three different design approaches (single-step, stage-sequential, and the proposed backward-cascading RAGEC), each progressively adding design features. This demonstrates which components matter." 82 }, 83 "multiple_metrics": { 84 "applies": true, 85 "answer": true, 86 "justification": "The paper reports stage classification agreement (57.8%), error type classification accuracy (40.3%), answer evaluation agreement (92.9%), and mode frequency consistency (Table 4). Multiple metrics are used across different evaluation aspects." 87 }, 88 "human_evaluation": { 89 "applies": true, 90 "answer": true, 91 "justification": "406 erroneous responses were manually annotated by the authors for stage and error type. Human annotations serve as ground truth for evaluating RAGEC. Section 5 describes the annotation process and compares RAGEC against human labels." 92 }, 93 "held_out_test_set": { 94 "applies": true, 95 "answer": false, 96 "justification": "There is no separation between development and test sets for the human-annotated data. The 406 annotations appear to have been used both to develop and evaluate RAGEC, with no held-out portion mentioned." 97 }, 98 "per_category_breakdown": { 99 "applies": true, 100 "answer": true, 101 "justification": "Table 3 provides detailed per-domain (finance, law, medicine) and per-query-type (factual, unanswerable, numerical comparison, etc.) breakdowns. Tables 2 and 6 show per-error-type distributions for DragonBall-EN and CLAPnq respectively." 
102 }, 103 "failure_cases_discussed": { 104 "applies": true, 105 "answer": true, 106 "justification": "The paper extensively discusses failure modes of both the RAG pipeline (16 error types with concrete examples in Section 4) and of the RAGEC auto-evaluation system itself (e.g., RAGEC over-blaming the generator vs. humans favoring retrieval, Table 1)." 107 }, 108 "negative_results_reported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The paper honestly reports that RAGEC achieves only 57.8% stage classification and 40.3% error-type classification agreement, stating these are 'far from perfect.' Alternative methods that performed worse are also reported in Section B.4." 112 } 113 }, 114 "claims_and_evidence": { 115 "abstract_claims_supported": { 116 "applies": true, 117 "answer": true, 118 "justification": "The abstract claims a new taxonomy, an auto-evaluation method, and a curated annotated dataset — all of which are presented and validated in the paper. No performance claims are overstated." 119 }, 120 "causal_claims_justified": { 121 "applies": true, 122 "answer": true, 123 "justification": "The paper makes causal claims about the RAG pipeline improvements in Section 5 ('Using RAGEC to Improve RAG'), where specific changes (recursive chunking, increased k) reduced errors from 832 to 534. The controlled manipulation of pipeline components provides adequate justification for these claims." 124 }, 125 "generalization_bounded": { 126 "applies": true, 127 "answer": true, 128 "justification": "The Limitations section explicitly states that the taxonomy is 'not exhaustive of all possible errors,' that the pipeline was 'not highly tuned' for the datasets, and that the study 'focuses on single-turn textual queries.' The scope is appropriately bounded to the tested configurations." 
129 }, 130 "alternative_explanations_discussed": { 131 "applies": true, 132 "answer": true, 133 "justification": "The paper discusses how dataset characteristics affect error distributions (e.g., CLAPnq has no chunking errors because it is pre-chunked), acknowledges that RAGEC over-blames the generator relative to humans, and explores why stage-sequential classification failed at chunking detection, providing substantive alternative explanations." 134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": true, 139 "answer": true, 140 "justification": "The paper specifies exact model names: 'Meta-Llama-3-8B-Instruct' for generation, 'gte-large-en-v1.5' for embeddings, 'rank-zephyr-7b-v1-full' for reranking (Table 5 and Section A). GPT-4o and GPT-4o-mini are named for auto-evaluation. These are specific enough to identify exact models on Hugging Face, though API snapshot dates for GPT-4o/GPT-4o-mini are not provided." 141 }, 142 "prompts_provided": { 143 "applies": true, 144 "answer": true, 145 "justification": "Full system prompts are provided in Listings 1-10 in the appendix for the reranker, generator, answer evaluation, stage classification, and all error type classification stages. Actual prompt templates with placeholders are shown with enough context to reconstruct inputs." 146 }, 147 "hyperparameters_reported": { 148 "applies": true, 149 "answer": true, 150 "justification": "Table 5 reports key hyperparameters: chunk size (128 tokens), overlap (25 tokens), top-k for retrieval (8 and 5), top-k' for reranking (5 and 3). The K=10 repetitions for classification consistency are also specified. However, LLM temperature/sampling settings for GPT-4o are not stated." 151 }, 152 "scaffolding_described": { 153 "applies": true, 154 "answer": true, 155 "justification": "The full RAG pipeline architecture is described in Section 3 and Figure 1, with each stage (chunking, embedding, retrieval, reranking, generation) detailed. 
The RAGEC auto-evaluation pipeline is described step-by-step in Section 5 and Appendix B." 156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 3 and Appendix A describe the chunking strategy (fixed-length with 128 tokens, 25-token overlap for DragonBall; pre-chunked for CLAPnq), embedding procedure, and the datasets used. DragonBall-CN uses 512-character chunks instead of 128-token chunks." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": true, 167 "justification": "A dedicated 'Limitations' section appears after Section 6 (Conclusion), discussing non-exhaustiveness of the taxonomy, pipeline tuning limitations, single-turn focus, and the challenges of RAG error classification." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": true, 172 "justification": "The Limitations section discusses specific threats: the taxonomy may miss errors from additional pipeline stages, the pipeline was not tuned for the datasets, evaluation is limited to single-turn textual queries, and annotation was done only by the authors (not crowdsourced), which could introduce bias." 173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": true, 177 "justification": "The paper explicitly states: 'Our taxonomy focuses on errors that occur within RAG systems, and hence we exclude failures caused by adversarial inputs, faults in the corpus, or similar anomalies' (Section 4). The Limitations section adds that multi-turn and multimodal inputs are out of scope." 178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": true, 184 "justification": "The annotated error dataset is released at the GitHub repository. The underlying DragonBall and CLAPnq datasets are publicly available. This allows independent verification of the annotation and classification results." 
185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": true, 189 "justification": "Section 5 describes the data curation process: 406 erroneous responses from DragonBall-EN were manually annotated for stage and error type. Annotators used all available context including ground truth, retrieved/reranked chunks, and the corpus." 190 }, 191 "recruitment_methods_described": { 192 "applies": true, 193 "answer": false, 194 "justification": "The Limitations section mentions 'all annotation was done manually by the authors' but does not describe how annotators were selected from among the authors, whether all authors annotated, or whether there was any inter-annotator agreement measurement. For a paper proposing ground-truth annotations, this is a gap." 195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": true, 199 "justification": "The pipeline is documented: 3108 EN + 3601 CN queries from DragonBall, 4946 from CLAPnq; 27% (25%) of responses classified as incorrect yielding 832 (1222) errors; 406 manually annotated from DragonBall-EN; 377 confirmed as erroneous (92.9% agreement). Each step is traceable." 200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": false, 206 "justification": "No funding source or acknowledgments section is present in the paper. All authors are affiliated with Layer 6 AI, a commercial entity, but no funding disclosure is made." 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "Author affiliations are clearly listed: all seven authors are from 'Layer 6 AI, Toronto, Canada' with email addresses provided." 212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": false, 216 "justification": "Layer 6 AI is a commercial AI company (acquired by TD Bank). 
While no explicit funding is disclosed, the work is conducted by a corporate lab that has a financial interest in RAG system development and deployment. The funder's independence cannot be established." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests or financial interests statement is present in the paper. Layer 6 AI is a commercial entity working on applied AI, which could benefit from the tools and frameworks described." 222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": true, 227 "answer": false, 228 "justification": "The paper uses Meta-Llama-3-8B-Instruct for generation and GPT-4o/GPT-4o-mini for evaluation but does not state the training data cutoff dates for any of these models. This is relevant because the benchmarks used (DragonBall, CLAPnq) could potentially overlap with training data." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": true, 232 "answer": false, 233 "justification": "No discussion of whether the DragonBall or CLAPnq datasets (or their source documents from Wikipedia for CLAPnq) may have been seen during Llama-3's pre-training. The DragonBall dataset uses domain-specific synthetic documents that may be less susceptible, but this is not discussed." 234 }, 235 "benchmark_contamination_addressed": { 236 "applies": true, 237 "answer": false, 238 "justification": "CLAPnq is based on Natural Questions which uses Wikipedia articles — likely present in Llama-3's training data. The paper does not discuss this contamination risk, though the RAG setting partially mitigates it since answers should come from retrieved context." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": false, 244 "answer": false, 245 "justification": "This is not a human subjects study. The 'human annotations' are expert annotations by the authors themselves to create ground truth labels, not a study of human participants." 
246 }, 247 "irb_or_ethics_approval": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants were studied. The annotation was done by the paper's authors as part of the research methodology." 251 }, 252 "demographics_reported": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants were studied. Author-annotators are not subjects." 256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants were studied." 261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants were studied; this is not an experimental study with conditions." 266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants were studied." 271 }, 272 "attrition_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants were studied." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": true, 281 "answer": false, 282 "justification": "No inference cost, API cost, or latency information is reported. RAGEC calls GPT-4o-mini K=10 times per datapoint across multiple stages, which has a non-trivial cost, but this is not quantified." 283 }, 284 "compute_budget_stated": { 285 "applies": true, 286 "answer": false, 287 "justification": "No total compute budget, GPU hours, or API spend is reported. Running Llama-3-8B-Instruct for generation on thousands of queries, plus GPT-4o/GPT-4o-mini calls for evaluation, would represent significant compute, but this is not disclosed." 
288 } 289 } 290 }, 291 "claims": [ 292 { 293 "claim": "RAGEC achieves 92.9% agreement with human annotations for answer evaluation (determining if a RAG response is incorrect).", 294 "evidence": "Section 5 reports that 406 queries were manually annotated and 377 were confirmed as erroneous, yielding a 92.9% agreement rate with RAGEC's answer evaluation step.", 295 "supported": "strong" 296 }, 297 { 298 "claim": "RAGEC achieves 57.8% stage classification agreement and 40.3% error-type classification accuracy compared to human annotations.", 299 "evidence": "Section 5 and Table 1 show the stage classification confusion matrix. The paper honestly acknowledges these rates are 'not close to perfect' but better than alternatives tested.", 300 "supported": "strong" 301 }, 302 { 303 "claim": "Targeted improvements guided by RAGEC reduced errors from 832 (73.3% correct) to 534 (82.8% correct) on DragonBall-EN.", 304 "evidence": "Section 5 ('Using RAGEC to Improve RAG') describes changing from fixed-length to recursive sentence-level chunking and increasing retrieved chunks, resulting in the improvement shown in Table 2 column 'Impr.'", 305 "supported": "moderate" 306 }, 307 { 308 "claim": "Fabricated Content (hallucination) is relatively rare compared to other error types in RAG systems.", 309 "evidence": "Section 5 and Table 2 show E10 (Fabricated Content) accounts for only 34 out of 832 modal errors in DragonBall-EN, while chunking, retrieval, and misinterpretation errors dominate. This is discussed as contrasting with 'the focus on hallucination in recent research.'", 310 "supported": "moderate" 311 }, 312 { 313 "claim": "The proposed taxonomy covers 16 error types across 4 RAG pipeline stages, more comprehensive than prior work.", 314 "evidence": "Section 2 compares to Barnett et al. (2024) with 7 types. Section 4 describes all 16 types with examples. 
The taxonomy covers chunking (3), retrieval (3), reranking (2), and generation (8) stages.", 315 "supported": "strong" 316 }, 317 { 318 "claim": "RAGEC outperforms single-step prompting (41.1%/31.1%) and stage-sequential classification (47.5%/36.3%) for error classification.", 319 "evidence": "Appendix B.4 describes both alternative approaches and their agreement rates compared to RAGEC's 57.8%/40.3%.", 320 "supported": "strong" 321 } 322 ], 323 "methodology_tags": [ 324 "benchmark-eval", 325 "case-study" 326 ], 327 "key_findings": "The paper proposes a 16-type error taxonomy for RAG systems spanning chunking, retrieval, reranking, and generation stages, finding that hallucination (fabricated content) is relatively rare compared to retrieval and chunking errors. The RAGEC auto-evaluation system achieves 92.9% agreement for answer evaluation but only 57.8% for stage classification and 40.3% for error-type classification, indicating RAG error attribution remains a challenging problem. A case study demonstrates that RAGEC-guided improvements (sentence-level chunking and increased retrieval) reduced errors by 35.8% on DragonBall-EN, from 832 to 534 errors.", 328 "red_flags": [ 329 { 330 "flag": "Low auto-evaluation accuracy presented as useful", 331 "detail": "The RAGEC system achieves only 40.3% error-type classification accuracy and 57.8% stage classification agreement, yet the paper frames this as a useful diagnostic tool. While the paper is honest about these numbers, the practical utility of a system whose error-type labels disagree with human annotators more often than they agree (40.3% agreement) is questionable." 332 }, 333 { 334 "flag": "Author-only annotations without inter-annotator agreement", 335 "detail": "All 406 annotations were done by the authors themselves. No inter-annotator agreement (e.g., Cohen's kappa) is reported, no external annotators were used, and it is unclear how many authors annotated each example or whether annotations were reviewed independently." 
336 }, 337 { 338 "flag": "No contamination analysis for benchmark datasets", 339 "detail": "CLAPnq is built on the Wikipedia-based Natural Questions dataset, which is almost certainly in Llama-3's training data. While RAG partially mitigates this by providing retrieved context, the paper does not discuss how parametric knowledge contamination may affect the error distribution (e.g., E11 Parametric Overreliance rates)." 340 }, 341 { 342 "flag": "No cost or compute reporting", 343 "detail": "The system involves running Llama-3-8B on thousands of queries plus GPT-4o/GPT-4o-mini calls (K=10 per error), but no cost or compute budget is reported. This makes it difficult to assess the practicality of RAGEC for practitioners." 344 } 345 ], 346 "cited_papers": [ 347 { 348 "title": "Seven failure points when engineering a retrieval augmented generation system", 349 "authors": ["Scott Barnett", "Stefanus Kurniawan", "Srikanth Thudumu", "Zach Brannelly", "Mohamed Abdelrazek"], 350 "year": 2024, 351 "relevance": "Most comparable prior work on RAG error taxonomy with seven failure types, directly compared to the proposed 16-type taxonomy." 352 }, 353 { 354 "title": "RAGChecker: A Fine-grained Framework for Diagnosing Retrieval-Augmented Generation", 355 "authors": ["Dongyu Ru"], 356 "year": 2024, 357 "relevance": "RAG evaluation framework that diagnoses retrieval-augmented generation at a fine-grained level, closely related to RAGEC's goals." 358 }, 359 { 360 "title": "ARES: An Automated Evaluation Framework for Retrieval-Augmented Generation Systems", 361 "authors": ["Jon Saad-Falcon", "Omar Khattab", "Christopher Potts", "Matei Zaharia"], 362 "year": 2024, 363 "relevance": "Automated RAG evaluation framework using LLM-as-a-Judge, a key prior work in the auto-evaluation space this paper contributes to." 
364 }, 365 { 366 "title": "RAGAs: Automated evaluation of retrieval augmented generation", 367 "authors": ["Shahul Es", "Jithin James", "Luis Espinosa Anke", "Steven Schockaert"], 368 "year": 2024, 369 "relevance": "RAG evaluation metrics framework using LLM-based evaluation, one of the main existing tools for RAG quality assessment." 370 }, 371 { 372 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 373 "authors": ["Lianmin Zheng"], 374 "year": 2023, 375 "relevance": "Foundational work on LLM-as-a-Judge methodology used throughout the RAGEC system for error classification." 376 }, 377 { 378 "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", 379 "authors": ["Patrick Lewis"], 380 "year": 2020, 381 "relevance": "Original RAG paper that established the retrieval-augmented generation paradigm evaluated in this work." 382 }, 383 { 384 "title": "Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection", 385 "authors": ["Akari Asai"], 386 "year": 2023, 387 "relevance": "Self-reflective RAG approach relevant to generation-stage error mitigation strategies discussed in the paper." 388 }, 389 { 390 "title": "Chain-of-verification reduces hallucination in large language models", 391 "authors": ["Shehzaad Dhuliawala"], 392 "year": 2024, 393 "relevance": "Post-generation validation technique discussed as a mitigation for fabricated content errors in RAG systems." 394 }, 395 { 396 "title": "Search Engines in an AI Era: The False Promise of Factual and Verifiable Source-Cited Responses", 397 "authors": ["Pranav Narayanan Venkit"], 398 "year": 2024, 399 "relevance": "Classifies errors in public QA systems from human evaluator perspectives, a key comparison point for the proposed taxonomy." 400 }, 401 { 402 "title": "Hallucination-Free? 
Assessing the Reliability of Leading AI Legal Research Tools", 403 "authors": ["Varun Magesh"], 404 "year": 2025, 405 "relevance": "Assesses reliability of AI legal research tools with focus on hallucination, relevant to RAG error analysis in domain-specific settings." 406 }, 407 { 408 "title": "A Survey on Hallucination in Large Language Models: Principles, Taxonomy, Challenges, and Open Questions", 409 "authors": ["Lei Huang"], 410 "year": 2025, 411 "relevance": "Comprehensive survey on LLM hallucination taxonomy that this paper contrasts by showing hallucination is only one of many RAG error types." 412 }, 413 { 414 "title": "Self-consistency improves chain of thought reasoning in language models", 415 "authors": ["Xuezhi Wang"], 416 "year": 2023, 417 "relevance": "Self-consistency technique used in RAGEC's K=10 repeated classification approach to improve reliability of LLM-based error classification." 418 } 419 ] 420 }