scan.json (26396B)
1 { 2 "paper": { 3 "title": "Enhancing LLM Factual Accuracy with RAG to Counter Hallucinations: A Case Study on Domain-Specific Queries in Private Knowledge-Bases", 4 "authors": [ 5 "Jiarui Li", 6 "Ye Yuan", 7 "Zehua Zhang" 8 ], 9 "year": 2024, 10 "venue": "arXiv", 11 "arxiv_id": "2403.10446", 12 "doi": "10.48550/arXiv.2403.10446" 13 }, 14 "scan_version": 2, 15 "active_modules": [ 16 "experimental_rigor", 17 "data_leakage" 18 ], 19 "methodology_tags": [ 20 "benchmark-eval", 21 "case-study" 22 ], 23 "key_findings": "A RAG pipeline using LLaMA-2-7B on CMU domain-specific data improves factual accuracy over the baseline (F1 from 0.186 to 0.289, Cosine from 0.504 to 0.577). Fine-tuning the embedding model provides additional gains, but fine-tuning the core LLaMA-2-7B model on a small, skewed dataset paradoxically hurts generation quality (F1 drops from 0.289 to 0.211). Two human annotators who checked the WizardLM-generated QA annotations reached a Cohen's Kappa of 0.67 (substantial agreement).", 24 "checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "The abstract states 'Our code and models are available on Github' but no repository URL is provided anywhere in the paper. Without a verifiable link, this cannot be confirmed." 30 }, 31 "data_released": { 32 "applies": true, 33 "answer": false, 34 "justification": "The dataset was curated from CMU websites and annotated with WizardLM, but no download link or dataset archive is provided. The paper does not release the 34,781 QA pairs or the crawled documents." 35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "The paper specifies model names (Llama-2-7b-chat-hf, mxbai-embed-large-v1, bge-reranker-large) and some training parameters, but provides no requirements.txt, Dockerfile, or detailed environment setup listing library versions and dependencies."
40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "No step-by-step reproduction instructions are provided. The paper describes the system at a high level but lacks specific commands or a README-style guide for replicating the experiments." 45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": true, 51 "justification": "Table 1 reports standard deviations in parentheses for Recall and F1 across 4 independent runs, e.g., '0.361 (0.069)' for baseline Recall." 52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": false, 56 "justification": "No statistical significance tests are reported. The paper compares configurations by raw numbers in Table 1 without any p-values, t-tests, or other statistical tests to support claims that one configuration outperforms another." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "The paper presents raw metric values in Table 1 but does not report effect sizes or percentage improvements. The reader must compute the magnitude of improvement from the raw numbers." 62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": false, 66 "justification": "The paper uses 128 QA pairs per evaluation run (randomly sampled from 6,957 test pairs) across 4 runs, but provides no justification for these choices and no power analysis." 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": true, 71 "justification": "Table 1 reports standard deviation across 4 independent runs: 'Both score and standard deviation are derived from 4 independent runs.'" 72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": true, 78 "justification": "Table 1 includes a 'Baseline (w/o RAG)' configuration and progressively adds components (RAG pipeline, embedding fine-tuning, core model fine-tuning)." 
79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": false, 83 "justification": "The only baseline is the authors' own system without RAG (raw LLaMA-2). No comparison against other RAG systems, established QA systems, or external baselines is provided." 84 }, 85 "ablation_study": { 86 "applies": true, 87 "answer": true, 88 "justification": "Table 1 presents a clear ablation study showing performance under 5 configurations: baseline, raw RAG, +embedding fine-tuning, +core model fine-tuning, and both fine-tuned." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "Four metrics are used: Recall, F1 Score, Cosine Similarity, and BLEU Score (Table 1, Figure 4)." 94 }, 95 "human_evaluation": { 96 "applies": true, 97 "answer": false, 98 "justification": "No systematic human evaluation of system outputs is conducted. The case study (Section 5.3) shows three qualitative examples but lacks structured human ratings. The Cohen's Kappa evaluation is for dataset annotation quality, not system output quality." 99 }, 100 "held_out_test_set": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 3.3 states '27,824 pairs are used as training data and 6,957 pairs are used as testing data after random split.' Evaluation samples from the 'human evaluated test set.'" 104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": false, 108 "justification": "Only aggregate metrics are reported across all test examples. No breakdown by question type, difficulty, topic, or data source (html vs pdf vs papers)." 109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 5.3 case study discusses failure modes: the model produces filler tokens ('context:', 'answer:', '<INSTR>'), generates repetitive text, and restates retrieved context rather than paraphrasing. Section 5.2 also notes the fine-tuning degradation." 
114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": true, 118 "justification": "Table 1 shows that fine-tuning the core model hurts F1 (0.289→0.211), Cosine (0.577→0.502), and BLEU (0.102→0.056). The paper explicitly discusses this: 'The cumulative effect of finetuning both models produced a drop in F1 score.'" 119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": true, 125 "justification": "The abstract claims the system is effective at generating more accurate answers (supported by Table 1 improvements) and reveals limitations of fine-tuning with small-scale datasets (supported by the negative fine-tuning results in Table 1 and Section 5.2 discussion)." 126 }, 127 "causal_claims_justified": { 128 "applies": true, 129 "answer": true, 130 "justification": "Causal claims like 'finetuning the embedding model yielded improvements' are supported by the ablation design in Table 1, which uses controlled single-variable manipulation across configurations." 131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": false, 135 "justification": "The main title 'Enhancing LLM Factual Accuracy with RAG to Counter Hallucinations' implies general applicability, while the study tests only CMU-specific queries with LLaMA-2-7B on a synthetically generated dataset. The subtitle and abstract frame the work more narrowly as a 'domain-specific' case study, but the paper does not state how far the findings extend beyond this single domain and model." 136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": false, 140 "justification": "No systematic discussion of alternative explanations for the results. For example, the fine-tuning degradation could be due to catastrophic forgetting, prompt format mismatch, or dataset quality — the paper speculates about causes in passing but does not weigh them against each other or test them."
141 }, 142 "proxy_outcome_distinction": { 143 "applies": true, 144 "answer": false, 145 "justification": "The paper measures Recall, F1, Cosine Similarity, and BLEU and frames these as demonstrating 'factual accuracy,' but does not discuss the gap between these proxy metrics and actual factual accuracy. BLEU and cosine similarity measure lexical/semantic similarity to references, not factual correctness." 146 } 147 }, 148 "setup_transparency": { 149 "model_versions_specified": { 150 "applies": true, 151 "answer": true, 152 "justification": "Specific HuggingFace model IDs are provided: 'meta-llama/Llama-2-7b-chat-hf', 'mixedbread-ai/mxbai-embed-large-v1', 'BAAI/bge-reranker-large', and 'sentence-transformers/all-MiniLM-L6-v2'." 153 }, 154 "prompts_provided": { 155 "applies": true, 156 "answer": true, 157 "justification": "Full prompt text is provided in Appendix B: B.1 (dataset generation prompt), B.2 (core model generation prompt), and B.3 (core model fine-tuning prompt)." 158 }, 159 "hyperparameters_reported": { 160 "applies": true, 161 "answer": false, 162 "justification": "Training hyperparameters are well-documented (LoRA rank 16, INT4, 5 epochs, 1000 max steps, batch size 8, learning rate 2e-4, embedding 10 epochs). However, inference-time parameters (temperature, top-p, max tokens) are not reported, and these significantly affect output quality." 163 }, 164 "scaffolding_described": { 165 "applies": false, 166 "answer": false, 167 "justification": "No agentic scaffolding is used. The system is a standard RAG pipeline (retriever → reranker → generator) without retry logic, tool use, feedback loops, or agent-like behavior." 
168 }, 169 "data_preprocessing_documented": { 170 "applies": true, 171 "answer": true, 172 "justification": "Section 3 describes preprocessing in detail: web crawling with Selenium/BeautifulSoup, BFS depth-2 link exploration, HTML/JavaScript removal, keyword-based filtering (Appendix C), removal of files <200 characters and 'Page_not_found' titles, chunking into 1000-word segments." 173 } 174 }, 175 "limitations_and_scope": { 176 "limitations_section_present": { 177 "applies": true, 178 "answer": false, 179 "justification": "No dedicated limitations or threats-to-validity section exists. Limitations are mentioned in passing within Sections 5.2 and 6, but there is no substantive dedicated discussion." 180 }, 181 "threats_to_validity_specific": { 182 "applies": true, 183 "answer": true, 184 "justification": "Specific threats are discussed inline: the dataset is 'possibly small in size and relatively biased' (Section 5.2), fine-tuning 'may reduce the model's performance in language generation' (Section 5.2), and the model's 7B parameter size limits its capacity (Section 5.3)." 185 }, 186 "scope_boundaries_stated": { 187 "applies": true, 188 "answer": false, 189 "justification": "No explicit scope boundaries are stated. The paper does not articulate what the results do NOT show or what populations/settings are excluded from the claims." 190 } 191 }, 192 "data_integrity": { 193 "raw_data_available": { 194 "applies": true, 195 "answer": false, 196 "justification": "Neither the crawled web data nor the generated QA pairs are made available for independent verification. No download links or data archives are provided." 
197 }, 198 "data_collection_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "Section 3 describes data collection in detail: web crawling from CMU websites using Selenium/BeautifulSoup, BFS link exploration, PDF download from CMU sites, research papers fetched via Semantic Scholar API using LTI faculty names, filtered to 2023 open-sourced papers." 202 }, 203 "recruitment_methods_described": { 204 "applies": false, 205 "answer": false, 206 "justification": "No human participants in the study. The two annotators for Cohen's Kappa evaluation appear to be the authors. Data comes from automated web crawling and LLM-generated annotations." 207 }, 208 "data_pipeline_documented": { 209 "applies": true, 210 "answer": true, 211 "justification": "Section 3 documents the full pipeline: crawling → HTML/PDF extraction → text extraction → keyword filtering → size filtering → chunking (1000 words) → WizardLM annotation (10 QA pairs per chunk) → producing 34,781 QA pairs → 80/20 train/test split." 212 } 213 }, 214 "conflicts_of_interest": { 215 "funding_disclosed": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding information is mentioned anywhere in the paper. No acknowledgments section with grants or sponsors." 219 }, 220 "affiliations_disclosed": { 221 "applies": true, 222 "answer": true, 223 "justification": "All three authors are identified as being from the Information Networking Institute, Carnegie Mellon University. Since they evaluate a CMU-specific system and are CMU students, the affiliation is relevant and disclosed." 224 }, 225 "funder_independent_of_outcome": { 226 "applies": false, 227 "answer": false, 228 "justification": "No funding is disclosed; this appears to be an unfunded student project at Carnegie Mellon University." 229 }, 230 "financial_interests_declared": { 231 "applies": true, 232 "answer": false, 233 "justification": "No competing interests or financial interests statement is present in the paper."
234 } 235 }, 236 "contamination": { 237 "training_cutoff_stated": { 238 "applies": true, 239 "answer": false, 240 "justification": "The paper uses LLaMA-2 but never states its training data cutoff date. CMU web pages could be in LLaMA-2's pretraining data, making this information critical." 241 }, 242 "train_test_overlap_discussed": { 243 "applies": true, 244 "answer": false, 245 "justification": "No discussion of whether LLaMA-2's pretraining data might include CMU web pages that were used to generate the test QA pairs. Also no discussion of overlap between the automatically generated train and test QA pairs from the same source documents." 246 }, 247 "benchmark_contamination_addressed": { 248 "applies": true, 249 "answer": false, 250 "justification": "CMU web pages were publicly available before LLaMA-2's training cutoff and could be in its pretraining data. The paper does not address this contamination risk, which would inflate the baseline performance." 251 } 252 }, 253 "human_studies": { 254 "pre_registered": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human-subjects study is conducted. System evaluation is entirely automated; the only human involvement is the two annotators (apparently the authors) who checked dataset annotation quality." 258 }, 259 "irb_or_ethics_approval": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in the study." 263 }, 264 "demographics_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 }, 269 "inclusion_exclusion_criteria": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in the study." 273 }, 274 "randomization_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in the study." 278 }, 279 "blinding_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in the study."
283 }, 284 "attrition_reported": { 285 "applies": false, 286 "answer": false, 287 "justification": "No human participants in the study." 288 } 289 }, 290 "cost_and_practicality": { 291 "inference_cost_reported": { 292 "applies": true, 293 "answer": false, 294 "justification": "No inference cost, latency, or per-query time is reported. The paper does not quantify how long the RAG pipeline takes to answer a query or the computational cost per inference." 295 }, 296 "compute_budget_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "While training details are given (1000 max steps, batch size 8, 5 epochs), no GPU hours, total training time, hardware specifications, or total compute budget are reported." 300 } 301 }, 302 "experimental_rigor": { 303 "seed_sensitivity_reported": { 304 "applies": true, 305 "answer": false, 306 "justification": "The 4 independent runs use different random samples of 128 QA pairs, not different random seeds for model training or initialization. No seed sensitivity analysis is performed." 307 }, 308 "number_of_runs_stated": { 309 "applies": true, 310 "answer": true, 311 "justification": "Table 1 caption states: 'Both score and standard deviation are derived from 4 independent runs. Each run randomly samples 128 QA pairs from our human evaluated test set.'" 312 }, 313 "hyperparameter_search_budget": { 314 "applies": true, 315 "answer": false, 316 "justification": "No hyperparameter search is described. Fixed hyperparameters are used (LoRA rank 16, learning rate 2e-4, etc.) with no mention of how they were selected or how many configurations were tried." 317 }, 318 "best_config_selection_justified": { 319 "applies": true, 320 "answer": false, 321 "justification": "The ablation study shows all configurations but does not justify which configuration is recommended or how it would be selected. Different configurations win on different metrics (e.g., +Embedding wins on F1/Cosine/BLEU but +Both wins on Recall)." 
322 }, 323 "multiple_comparison_correction": { 324 "applies": false, 325 "answer": false, 326 "justification": "No statistical tests are performed, so multiple comparison correction is structurally inapplicable." 327 }, 328 "self_comparison_bias_addressed": { 329 "applies": true, 330 "answer": false, 331 "justification": "The authors evaluate their own system against their own baseline and ablations. No acknowledgment of author-evaluation bias or discussion of independent evaluation." 332 }, 333 "compute_budget_vs_performance": { 334 "applies": true, 335 "answer": false, 336 "justification": "The RAG pipeline and fine-tuned models use more compute than the baseline, but performance is not discussed as a function of compute budget. The core model fine-tuning uses additional compute (LoRA, 1000 steps) but actually hurts several metrics." 337 }, 338 "benchmark_construct_validity": { 339 "applies": true, 340 "answer": false, 341 "justification": "The paper evaluates on a custom QA dataset using Recall, F1, Cosine Similarity, and BLEU, claiming to measure 'factual accuracy.' No discussion of whether these metrics actually capture factual accuracy, or whether the WizardLM-generated QA pairs are valid proxies for real user queries." 342 }, 343 "scaffold_confound_addressed": { 344 "applies": false, 345 "answer": false, 346 "justification": "The RAG pipeline IS the system being evaluated — it is the intervention, not a confound. The ablation study appropriately isolates components within the same pipeline." 347 } 348 }, 349 "data_leakage": { 350 "temporal_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of temporal leakage. CMU web pages and research papers from 2023 were crawled to create the dataset, but the paper does not consider whether LLaMA-2's pretraining data (collected before July 2023) might already include this content." 
354 }, 355 "feature_leakage_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of feature leakage. The QA pairs are generated from the same document chunks that serve as the retrieval corpus, creating a potential circularity: the 'correct' answer was generated from the same context that RAG retrieves." 359 }, 360 "non_independence_addressed": { 361 "applies": true, 362 "answer": false, 363 "justification": "No discussion of train/test independence. Training and test QA pairs are randomly split from the same pool of 34,781 WizardLM-generated pairs from the same document corpus. QA pairs from the same document chunk could appear in both splits." 364 }, 365 "leakage_detection_method": { 366 "applies": true, 367 "answer": false, 368 "justification": "No leakage detection or prevention method is used. No decontamination, membership inference, temporal splits, or independence verification is performed." 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "RAG improves factual accuracy of LLaMA-2 for CMU domain-specific queries across all metrics", 375 "evidence": "Table 1 shows improvement from baseline to RAG pipeline: Recall 0.361→0.409, F1 0.186→0.289, Cosine 0.504→0.577, BLEU 0.043→0.102 (Section 5.2).", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Fine-tuning the embedding model improves retrieval and downstream QA performance", 380 "evidence": "Table 1 shows improvement from raw RAG to +Embedding: Recall 0.409→0.437, F1 0.289→0.304, Cosine 0.577→0.597, BLEU 0.102→0.108 (Section 5.2).", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Fine-tuning LLaMA-2-7B on this small dataset hurts generation quality despite improving recall", 385 "evidence": "Table 1 shows core model fine-tuning improves Recall (0.409→0.448) but degrades F1 (0.289→0.211), Cosine (0.577→0.502), and BLEU (0.102→0.056). 
The paper attributes this to small/biased dataset and prompt format mismatch (Section 5.2).", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "The automated annotation process achieves substantial inter-annotator agreement (Cohen's κ = 0.67)", 390 "evidence": "Section 3.4 reports two annotators independently classified a subset, achieving κ = 0.67 (83.33% agreement).", 391 "supported": "moderate" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "No external baselines", 397 "detail": "The paper only compares against its own system variants (ablations). No comparison against other RAG systems, established QA baselines, or concurrent work." 398 }, 399 { 400 "flag": "Circular evaluation design", 401 "detail": "QA pairs for both training and testing are generated by WizardLM from the same document corpus that serves as the RAG retrieval source. This creates a circularity: the 'correct' answers were generated from the same context that the system retrieves, inflating metrics." 402 }, 403 { 404 "flag": "No significance tests for comparative claims", 405 "detail": "Despite reporting standard deviations showing overlapping ranges between configurations (e.g., Recall 0.437±0.076 vs 0.448±0.106), no statistical tests are used to determine whether differences are significant." 406 }, 407 { 408 "flag": "Code URL missing", 409 "detail": "The abstract claims 'Our code and models are available on Github' but provides no repository URL, making verification impossible." 410 }, 411 { 412 "flag": "Tiny evaluation sample", 413 "detail": "Only 128 QA pairs per run are sampled from 6,957 test pairs (1.8% of the test set), with no justification for this small sample size." 414 }, 415 { 416 "flag": "LLM-generated ground truth", 417 "detail": "The 'ground truth' QA pairs are generated by WizardLM, not by human experts. Cohen's Kappa (0.67) measures agreement between two annotators on the automated outputs, not the correctness of the answers themselves." 
418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "Language Models are Few-Shot Learners", 423 "authors": ["Tom B. Brown", "Benjamin Mann", "Nick Ryder"], 424 "year": 2020, 425 "relevance": "GPT-3 paper establishing few-shot learning capabilities of large language models." 426 }, 427 { 428 "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models", 429 "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"], 430 "year": 2023, 431 "relevance": "Core model used in this study; foundational open-source LLM that serves as both the no-RAG baseline and the generator in the RAG pipeline." 432 }, 433 { 434 "title": "Retrieval-Augmented Generation for Large Language Models: A Survey", 435 "authors": ["Yunfan Gao", "Yun Xiong", "Xinyu Gao"], 436 "year": 2023, 437 "relevance": "Survey of RAG techniques for improving LLM factual accuracy, directly relevant to methodology evaluation." 438 }, 439 { 440 "title": "A Survey on Hallucination in Large Language Models: Principles, Taxonomy, Challenges, and Open Questions", 441 "authors": ["Lei Huang", "Weijiang Yu", "Weitao Ma"], 442 "year": 2023, 443 "relevance": "Taxonomy of LLM hallucination types and mitigation strategies." 444 }, 445 { 446 "title": "WizardLM: Empowering Large Language Models to Follow Complex Instructions", 447 "authors": ["Can Xu", "Qingfeng Sun", "Kai Zheng"], 448 "year": 2023, 449 "relevance": "Model used as the annotation teacher in this study; relevant to LLM-as-annotator methodology." 450 }, 451 { 452 "title": "A Survey on In-Context Learning", 453 "authors": ["Qingxiu Dong", "Lei Li", "Damai Dai"], 454 "year": 2022, 455 "relevance": "Survey of in-context learning methods for LLMs, relevant to prompting and few-shot evaluation methodology." 456 }, 457 { 458 "title": "StarCoder: May the Source Be with You!", 459 "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi"], 460 "year": 2023, 461 "relevance": "Code-specialized LLM relevant to understanding parametric knowledge storage in language models."
462 }, 463 { 464 "title": "On the Dangers of Stochastic Parrots: Can Language Models Be Too Big?", 465 "authors": ["Emily M Bender", "Timnit Gebru", "Angelina McMillan-Major"], 466 "year": 2021, 467 "relevance": "Foundational critique of LLM risks and limitations, cited in context of hallucination causes." 468 }, 469 { 470 "title": "HaluEval-Wild: Evaluating Hallucinations of Language Models in the Wild", 471 "authors": ["Zhiying Zhu", "Zhiqing Sun", "Yiming Yang"], 472 "year": 2024, 473 "relevance": "Hallucination evaluation benchmark for LLMs in real-world settings." 474 } 475 ] 476 }