scan.json (24948B)
1 { 2 "paper": { 3 "title": "Citation-Enhanced Generation for LLM-based Chatbots", 4 "authors": ["Weitao Li", "Junkai Li", "Weizhi Ma", "Yang Liu"], 5 "year": 2024, 6 "venue": "ACL 2024 Main Conference", 7 "arxiv_id": "2402.16063" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "The abstract states: 'Our code and datasets can be found at https://github.com/Tsinghua-dhy/CEG.' A working URL is provided." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses publicly available benchmarks (WikiBio GPT-3, FELM, HaluEval) and releases the newly constructed WikiRetr datasets at the GitHub repository mentioned in the abstract." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is mentioned in the paper. The paper does not specify library versions or dependencies beyond naming the tools used (NLTK, SimCSE BERT)." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided in the paper. While code is released and prompts are listed in the appendix, there is no 'Reproducing Results' section or README-level instructions described in the paper itself." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "All results are reported as point estimates (e.g., '77.59' balanced accuracy, '69.45%' accuracy) without confidence intervals, error bars, or any uncertainty quantification." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims CEG 'outperforms' baselines based solely on comparing numbers in tables. No statistical significance tests (p-values, t-tests, etc.) are reported for any comparison." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports improvements with baseline context, e.g., '8.10% improvements compared to the pre-hoc retrieval strategy' (Section 5.2), and tables show both CEG and baseline numbers enabling magnitude assessment." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification is given for sample sizes. For HaluEval, '2,000 samples of which are randomly sampled' but no rationale for why 2,000. For WikiRetr, 1,000 passages are selected and 100 are manually annotated, with no power analysis or justification." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No variance, standard deviation, or spread measures are reported. The paper sets temperature to 0 for reproducibility but does not report variance across any experimental runs." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Multiple baselines are included for each benchmark: HalluDetector, Focus, SelfCheckGPT variants for WikiBio GPT-3; Vanilla, CoT, Link, Doc for FELM; Vanilla, CoT, Pre-Retrieval for HaluEval." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines are contemporary and represent the state of the art at time of writing: SelfCheckGPT (2023), Focus (2023), HalluDetector (2023), and FELM baselines from Chen et al. (2023)." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Section 5.3.1 presents ablation experiments on FELM, removing the retrieval augmentation module (w/o RA) and the document selection threshold (w/o Threshold), showing each component's contribution (Table 4)." 
74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Multiple metrics are used: AUC-PR and Balanced_Accuracy for WikiBio GPT-3; Nonfactual accuracy, Factual accuracy, and Balanced accuracy for FELM; Accuracy for HaluEval; Recall@k and Precision@k for WikiRetr." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": true, 83 "justification": "Human annotation is used to establish ground truth for the WikiRetr datasets: three annotators manually assess whether original passages support rewritten claims (Section 4.5, Appendix D). Agreement rates with human annotators are reported in Table 5." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "The paper uses established benchmark test sets (WikiBio GPT-3, FELM WorldKnowledge subset, HaluEval QA subset) that are standard evaluation splits from prior work." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down by category: Nonfactual vs. Factual accuracy in Tables 1, 2, and 4. WikiRetr results broken down by GPT-3 vs. GPT-4 variants. Results per backbone LLM (Vicuna-33B, ChatGPT, GPT-4) in Table 2." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "The paper discusses failure cases: Vicuna-33B shows 'some degree of decline' indicating 'limitations in general ability' (Section 5.1). Case studies in Appendix C show examples including incorrect initial reasoning that is corrected via regeneration." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Several negative results are reported: CEG performs slightly worse than SelfCheckGPT w/NLI on nonfactual AUC-PR (Section 5.1); pre-hoc retrieval performs worse than CoT baseline on HaluEval (Section 5.2); Vicuna-33B performs worse with CEG than vanilla (Table 2)." 
104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims the framework 'outperforms state-of-the-art methods in both hallucination detection and response regeneration on three benchmarks.' Tables 1-3 support this: CEG achieves best balanced accuracy on WikiBio GPT-3 and FELM with GPT-4, and best accuracy on HaluEval." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper makes causal claims via ablation (removing RA module, removing threshold), which constitutes controlled single-variable manipulation (Table 4). The ablation design is adequate for supporting claims that individual components contribute to performance." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper claims the method is 'capable of various LLMs' and has 'significant practical implications' but only tests on GPT models and Vicuna-33B, all on Wikipedia-based knowledge QA. The title 'LLM-based Chatbots' is broader than what is tested. The Limitations section partially addresses this but the abstract and introduction overclaim generality." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper does not substantively discuss alternative explanations for its results. For example, it does not consider whether improvements come from the additional compute/API calls rather than the specific architecture, or whether the regeneration module simply benefits from a second attempt regardless of citation feedback." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Section 4.1 specifies: 'ChatGPT refers to GPT-3.5-Turbo-1106, and GPT-4 refers to GPT-4-0613.' Table 11 also mentions GPT-4-1106-preview. Vicuna-33B is named. 
These are specific versioned model identifiers." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Full prompts are provided in Appendix A (Tables 12-17), including the NLI prompt for WikiBio, the CEG evaluation prompt for FELM, baseline prompts for HaluEval, and the regeneration prompt. These are actual prompt texts with placeholders clearly marked." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Section 4.1 states 'We set the decoding temperature as 0.' Section 5.3 reports k values (4-6), threshold (0.5), and k sensitivity analysis in Figure 4. These are the key hyperparameters for the system." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The multi-module pipeline is described in detail in Section 3: retrieval augmentation module (Section 3.2), citation generation module (Section 3.3), and response regeneration module (Section 3.4), with workflow diagrams (Figure 2) and the iterative regeneration loop." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Data preprocessing is documented: Wikipedia corpus segmented into ~100-word documents (Section 3.2), NLTK sentence tokenizer used for claim segmentation (Section 3.2), WikiRetr construction process described (Section 4.5), and dataset statistics provided in Appendix B." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "A dedicated 'Limitations' section is present after the Conclusion, listing four specific limitations covering restricted retriever/corpus, lack of new QA datasets, NLI method reliance on LLM knowledge, and API costs." 
160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "The Limitations section discusses specific threats: restricted to Wikipedia corpus limiting applicability to 'general knowledge-based question-answering scenarios,' reliance on existing benchmarks for regeneration evaluation, and NLI module dependence on LLM world knowledge." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "While the Limitations section mentions the Wikipedia corpus restriction, the paper does not explicitly state what the results do NOT show. It does not bound claims to specific model families, languages, or domains in a systematic way. The broad framing ('LLM-based Chatbots') is not reconciled with the narrow evaluation scope." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "The underlying benchmarks (WikiBio GPT-3, FELM, HaluEval) are publicly available. The newly constructed WikiRetr datasets are released at the GitHub repository. Manual annotations for WikiRetr are described (Appendix D)." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Data collection is described for WikiRetr: 1,000 passages randomly selected from Wikipedia, rewritten by text-davinci-003 and GPT-4 (Section 4.5). For existing benchmarks, their construction is summarized (Sections 4.2-4.4)." 182 }, 183 "recruitment_methods_described": { 184 "applies": true, 185 "answer": false, 186 "justification": "The paper uses three human annotators for WikiRetr dataset validation but does not describe how annotators were recruited, their qualifications, or potential selection biases." 
187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The data pipeline is documented: Wikipedia snapshot (Oct 20, 2023) segmented into ~100-word documents, passages randomly selected, rewritten by LLMs, then manually annotated by three annotators with consensus resolution for disagreements (Section 4.5, Appendix D)." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "The Acknowledgement section states: 'This work is supported by the National Natural Science Foundation of China (No. 62276152, 61925601, 62372260).'" 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly listed: Tsinghua University (Dept. of Computer Science, Institute for AI, AIR). No evaluated product is affiliated with the authors' institution." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": true, 208 "justification": "The funder is the National Natural Science Foundation of China, a government agency with no financial stake in whether the CEG framework outperforms baselines." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement is present in the paper. Absence of disclosure is not the same as absence of conflict." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper uses GPT-3.5 and GPT-4 models to evaluate on benchmarks but does not state the training data cutoff dates for these models. The WikiBio GPT-3 dataset was generated by text-davinci-003, and benchmark data could overlap with model training data." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of potential train/test overlap. 
The benchmarks (WikiBio, FELM, HaluEval) are based on Wikipedia content that is likely in the training data of the GPT models used for NLI evaluation." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "No discussion of benchmark contamination. WikiBio GPT-3 (2023), FELM (2023), and HaluEval (2023) may predate the training cutoffs of some of the evaluated models (the paper does not state those cutoffs), and the paper does not address whether these benchmarks or their source Wikipedia content appeared in training data." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants study. The three annotators perform dataset validation, not a human subjects experiment." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants study requiring IRB approval. Annotators are performing a labeling task, not serving as experimental subjects." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants study. The annotators are performing dataset validation, not participating in a study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants study requiring randomization." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants study requiring blinding." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants study."
268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "Table 11 reports API call counts for different GPT models across all experiments. The Limitations section acknowledges 'Prompting to regenerate and Using NLI technology to generate citations both incur API cost.' While not in dollar amounts, the number of API calls is reported." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": true, 279 "justification": "Table 11 provides a detailed breakdown of API calls per experiment and per model version (GPT-3.5 and GPT-4), giving a clear picture of the computational budget. For example, ~7700 GPT-3.5 calls for HaluEval, ~20000 GPT-4 calls for NLI experiments on WikiRetr." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "CEG outperforms state-of-the-art methods in hallucination detection on WikiBio GPT-3, achieving 77.59% balanced accuracy and 70.24% factual AUC-PR.", 286 "evidence": "Table 1 shows CEG achieves the best Balanced_Acc (77.59%) and Factual AUC-PR (70.24%), surpassing SelfCheckGPT w/Prompt (72.64% balanced, 68.37% factual).", 287 "supported": "strong" 288 }, 289 { 290 "claim": "CEG with GPT-4 achieves the best balanced accuracy (69.9%) on the FELM WorldKnowledge subset.", 291 "evidence": "Table 2 shows CEG with GPT-4 achieves 69.9% balanced accuracy, compared to 68.4% for CoT with GPT-4 and 64.6% for Doc with GPT-4.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "CEG achieves 8.10% improvement over pre-hoc retrieval strategy on HaluEval QA, reaching 69.45% accuracy.", 296 "evidence": "Table 3 shows CEG w/GPT-3.5-Turbo-Instruct at 69.45% vs. w/Pre-Retrieval at 61.35%, a difference of 8.10 percentage points.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "The method is a training-free plug-and-play plugin capable of working with various LLMs.", 301 "evidence": "Experiments use GPT-3.5, GPT-4, and Vicuna-33B as backbones (Tables 1-3). 
However, Vicuna-33B shows degraded performance with CEG compared to vanilla (Table 2: 52.0% vs 53.4% balanced).", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "Post-hoc retrieval augmentation is more effective than pre-hoc retrieval for hallucination mitigation.", 306 "evidence": "Table 3 shows pre-retrieval (61.35%) performs worse than vanilla (63.40%), while CEG (69.00-69.45%) outperforms both. Table 2 shows CEG outperforms Doc and Link baselines on FELM.", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "Each module (retrieval augmentation, threshold filtering) contributes to the overall framework performance.", 311 "evidence": "Table 4 ablation study on FELM shows removing RA drops balanced accuracy from 69.9% to 61.9% (GPT-4), and removing threshold drops it from 69.9% to 67.0%.", 312 "supported": "strong" 313 } 314 ], 315 "methodology_tags": ["benchmark-eval"], 316 "key_findings": "The paper proposes Citation-Enhanced Generation (CEG), a post-hoc framework that uses retrieval augmentation and NLI to add citations to LLM responses and regenerate when hallucinations are detected. CEG achieves state-of-the-art performance on WikiBio GPT-3 (77.59% balanced accuracy), FELM (69.9% with GPT-4), and HaluEval (69.45% accuracy). Ablation studies confirm both the retrieval augmentation and threshold filtering modules contribute to performance. The method is evaluated with GPT-3.5, GPT-4, and Vicuna-33B as backbones, though Vicuna-33B performs slightly worse with CEG than with vanilla prompting on FELM (52.0% vs 53.4% balanced accuracy).", 317 "red_flags": [ 318 { 319 "flag": "No statistical significance testing", 320 "detail": "All performance comparisons are based on single-point estimates without significance tests, confidence intervals, or error bars. Temperature is set to 0, which makes outputs deterministic for a given input but does not address sampling variability in dataset composition."
321 }, 322 { 323 "flag": "No contamination analysis", 324 "detail": "GPT models are used to evaluate on Wikipedia-based benchmarks without any discussion of whether these benchmarks or their source content appeared in the models' training data. This is particularly concerning for the NLI evaluation where the model's world knowledge may be confounded with NLI capability." 325 }, 326 { 327 "flag": "Generalization overclaimed", 328 "detail": "The paper claims the method is 'capable of various LLMs' and is broadly applicable, but only tests on three models from two model families (GPT-3.5 and GPT-4, plus Vicuna-33B) on Wikipedia-based knowledge QA tasks. Vicuna-33B actually performs worse with CEG than vanilla on FELM (52.0% vs 53.4% balanced accuracy), undermining the generality claim." 329 }, 330 { 331 "flag": "Annotator information missing", 332 "detail": "Three annotators are used for WikiRetr ground truth validation, but no information is provided about their qualifications, recruitment, or potential biases." 333 } 334 ], 335 "cited_papers": [ 336 { 337 "title": "Enabling large language models to generate text with citations", 338 "authors": ["Tianyu Gao", "Howard Yen", "Jiatong Yu", "Danqi Chen"], 339 "year": 2023, 340 "relevance": "Directly comparable prior work on citation-augmented LLM generation, representing the pre-hoc approach that CEG aims to improve upon."
347 }, 348 { 349 "title": "HaluEval: A large-scale hallucination evaluation benchmark for large language models", 350 "authors": ["Junyi Li", "Xiaoxue Cheng", "Xin Zhao", "Jian-Yun Nie", "Ji-Rong Wen"], 351 "year": 2023, 352 "relevance": "Major hallucination evaluation benchmark used in this study; relevant to LLM quality assessment methodology." 353 }, 354 { 355 "title": "FELM: Benchmarking factuality evaluation of large language models", 356 "authors": ["Shiqi Chen", "Yiran Zhao", "Jinghan Zhang"], 357 "year": 2023, 358 "arxiv_id": "2310.00741", 359 "relevance": "Hallucination detection benchmark used in evaluation; provides methodology for factuality evaluation of LLMs." 360 }, 361 { 362 "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks", 363 "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"], 364 "year": 2020, 365 "relevance": "Foundational RAG paper that establishes retrieval-augmented generation as a technique for reducing hallucination in LLMs." 366 }, 367 { 368 "title": "Training language models to follow instructions with human feedback", 369 "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"], 370 "year": 2022, 371 "relevance": "InstructGPT/RLHF paper representing the alignment approach to hallucination reduction, an alternative to post-hoc methods." 372 }, 373 { 374 "title": "Survey of hallucination in natural language generation", 375 "authors": ["Ziwei Ji", "Nayeon Lee", "Rita Frieske"], 376 "year": 2023, 377 "relevance": "Comprehensive survey of hallucination in LLMs, providing taxonomy and background for hallucination mitigation research." 378 }, 379 { 380 "title": "Citation: A key to building responsible and accountable large language models", 381 "authors": ["Jie Huang", "Kevin Chen-Chuan Chang"], 382 "year": 2023, 383 "arxiv_id": "2307.02185", 384 "relevance": "Argues for citation as a mechanism for LLM accountability; directly motivates the citation-enhanced approach." 
385 }, 386 { 387 "title": "Siren's song in the AI ocean: a survey on hallucination in large language models", 388 "authors": ["Yue Zhang", "Yafu Li", "Leyang Cui"], 389 "year": 2023, 390 "arxiv_id": "2309.01219", 391 "relevance": "Survey on LLM hallucination providing broader context for hallucination detection and mitigation methods." 392 }, 393 { 394 "title": "Judging LLM-as-a-judge with MT-bench and Chatbot Arena", 395 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 396 "year": 2023, 397 "arxiv_id": "2306.05685", 398 "relevance": "Evaluates LLM-as-a-judge methodology; relevant to using LLMs for evaluation and the NLI approach used in CEG." 399 }, 400 { 401 "title": "TRUE: Re-evaluating factual consistency evaluation", 402 "authors": ["Or Honovich", "Roee Aharoni", "Jonathan Herzig"], 403 "year": 2022, 404 "relevance": "Proposes True-9B NLI model used as a baseline in CEG's citation generation evaluation; relevant to factual consistency methods." 405 }, 406 { 407 "title": "Retrieval-augmented generation across heterogeneous knowledge", 408 "authors": ["Wenhao Yu"], 409 "year": 2022, 410 "relevance": "Extends RAG to heterogeneous knowledge sources; relevant to retrieval-based hallucination mitigation approaches." 411 } 412 ] 413 }