scan.json (24970B)
1 { 2 "paper": { 3 "title": "Check Your Facts and Try Again: Improving Large Language Models with External Knowledge and Automated Feedback", 4 "authors": [ 5 "Baolin Peng", 6 "Michel Galley", 7 "Pengcheng He", 8 "Hao Cheng", 9 "Yujia Xie", 10 "Yu Hu", 11 "Qiuyuan Huang", 12 "Lars Liden", 13 "Zhou Yu", 14 "Weizhu Chen", 15 "Jianfeng Gao" 16 ], 17 "year": 2023, 18 "venue": "arXiv", 19 "arxiv_id": "2302.12813" 20 }, 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The paper states 'We make the source code and models publicly available' in the abstract, and provides a URL: https://aka.ms/llm-augmenter (footnote 1)." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "The paper uses publicly available datasets: DSTC7 Track 2 (News Chat), DSTC11 Track 5 (Customer Service), and OTT-QA (Wiki QA). All are public benchmarks." 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "No environment specifications, requirements.txt, Dockerfile, or detailed dependency listing is provided in the paper." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "No step-by-step reproduction instructions are provided in the paper. The code is released, but the paper itself does not include instructions for replicating experiments." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "The main results in Tables 1, 2, and 5 report only point estimates. Figure 3 shows shaded regions for max/min over 5 runs for the RL learning curve, but the main evaluation results lack confidence intervals or error bars." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": true, 53 "justification": "Table 3 reports 'All differences are significant (p < 0.05)' for the human evaluation. 
Significance testing is mentioned for human evaluation results." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "The paper reports effect sizes with baseline context, e.g., 'improve ChatGPT by 32.3% in Usefulness' and 'absolute +10% in F1' and specific point improvements like '10 and 6 points' in KF1. Tables provide both baseline and improved scores." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "No justification is provided for the sample sizes used. The 948 examples for human evaluation and 1370 examples for News Chat evaluation are stated without justification for why these sizes are sufficient." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "Figure 3 shows max/min over 5 runs for the RL policy training curve only. The main automatic evaluation results in Tables 1, 2, and 5 do not report variance, standard deviation, or any spread measure across runs." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "ChatGPT without augmentation serves as the baseline across all experiments. For Wiki QA, DPR vs. CORE retrieval is also compared. The paper also mentions comparison with state-of-the-art fine-tuned models (Ma et al., 2022)." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": false, 80 "justification": "The primary baseline is ChatGPT alone (zero-shot). There are no comparisons against other contemporary retrieval-augmented approaches (e.g., REPLUG, WebGPT, or other RAG methods mentioned in related work). The only fine-tuned comparison is Ma et al. (2022)." 
81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": true, 85 "justification": "Section 3.5 provides ablation studies: (1) different policies for knowledge consolidator usage (Figure 4), (2) different feedback types (Table 4), (3) utility function + feedback combinations (Figure 5). These ablate individual components." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "The paper uses 8 automatic metrics for dialog tasks (KF1, BLEU, ROUGE, chrF, METEOR, BERTScore, BARTScore, BLEURT) and precision/recall/F1 for Wiki QA. Human evaluation also includes Usefulness and Humanness." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": true, 95 "justification": "Section 3.4 reports human evaluation using Amazon Mechanical Turk with 948 examples. Workers rated Usefulness and Humanness on a 5-point Likert-like scale (Table 3)." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "For News Chat, a dedicated evaluation set of 1370 examples is used. For Customer Service, the validation set is used (test set was unavailable). For Wiki QA, a test set is used. The RL policy training uses test data for evaluation (Figure 3)." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": false, 105 "justification": "No per-category breakdown is provided. The Wiki QA dataset has single-hop (13%), two-hop (57%), and multi-hop (30%) categories, but results are not broken down by these categories." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "Section 4.3 mentions that 'most error cases are hallucinated answers and ChatGPT abstains from answering for 17% cases.' Table 6 in the appendix shows a qualitative example of the feedback loop correcting an initial bad response." 
111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper notes that 'there still remains a noticeable gap in performance' compared to fine-tuned models (Section 4.3), that inter-annotator agreement is low (0.15 and 0.07 Krippendorff's alpha in Table 3), and acknowledges latency issues in Section 6." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "The abstract claims LLM-AUGMENTER 'significantly reduces ChatGPT's hallucinations without sacrificing the fluency and informativeness.' Tables 1, 2, 5 show improved KF1 while maintaining or improving fluency metrics, and Table 3 shows human evaluation support." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "The paper makes causal claims (e.g., 'incorporating feedback leads to substantial improvement'). The ablation studies (Section 3.5) systematically add/remove components to establish causal contribution of each module. This is adequate controlled manipulation." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": false, 132 "justification": "The title 'Improving Large Language Models with External Knowledge and Automated Feedback' implies generality, but experiments are only on ChatGPT across three specific tasks. While the paper mentions applicability to 'other LLMs such as GPT-3 or PaLM,' this is not tested." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper does not discuss alternative explanations for the improvements. For example, improvements could be partly due to longer or more verbose responses, or the added prompting context could be doing the work rather than the iterative feedback mechanism specifically. No confounds are analyzed." 
138 } 139 }, 140 "setup_transparency": { 141 "model_versions_specified": { 142 "applies": true, 143 "answer": false, 144 "justification": "The paper refers to 'ChatGPT' throughout without specifying a version, snapshot date, or API version. The T5-Base model is named but ChatGPT is unversioned. The paper mentions using ChatGPT's 'current limited bandwidth' suggesting early API access but no version is specified." 145 }, 146 "prompts_provided": { 147 "applies": true, 148 "answer": true, 149 "justification": "Appendix Tables 7, 8, and 9 provide the actual prompt templates used for News Chat, Customer Service, and Wiki QA respectively. The templates include the exact instruction text and structure with placeholders for dynamic content." 150 }, 151 "hyperparameters_reported": { 152 "applies": true, 153 "answer": false, 154 "justification": "No temperature, top-p, or other LLM API parameters are reported for ChatGPT. The paper does not mention sampling settings. For the RL policy, the T5-Base model is mentioned but training hyperparameters are not detailed." 155 }, 156 "scaffolding_described": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 2 provides detailed descriptions of the agentic scaffolding: Working Memory (Section 2.1), Policy (Section 2.2), Action Executor with Knowledge Consolidator and Prompt Engine (Section 2.3), and Utility module (Section 2.4). The MDP formulation and feedback loop are well-documented." 160 }, 161 "data_preprocessing_documented": { 162 "applies": true, 163 "answer": true, 164 "justification": "Section 3.1 documents the News Chat data crawling process: selecting Reddit threads with URLs from 2021-2022, restricting to curated news websites, selecting oracle passages via ROUGE-F1, filtering by F1 threshold, resulting in 1370 examples. DSTC11 and OTT-QA are standard benchmarks." 
165 } 166 }, 167 "limitations_and_scope": { 168 "limitations_section_present": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 6 'Limitations and Future Directions' is a dedicated section discussing limitations including latency, bandwidth constraints, and absence of human evaluation for Wiki QA." 172 }, 173 "threats_to_validity_specific": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 6 discusses specific threats: (1) interactive feedback with ChatGPT significantly slows down user experience, (2) RL experiments used T5-Base instead of ChatGPT due to bandwidth limitations, (3) human evaluation was initially not included for Wiki QA. These are specific to this study." 177 }, 178 "scope_boundaries_stated": { 179 "applies": true, 180 "answer": false, 181 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound generalizations to specific LLMs, tasks, or domains tested. The limitations section discusses practical constraints but not the boundaries of the claims themselves." 182 } 183 }, 184 "data_integrity": { 185 "raw_data_available": { 186 "applies": true, 187 "answer": true, 188 "justification": "The underlying datasets (DSTC7, DSTC11, OTT-QA) are publicly available benchmarks. The source code and models are also publicly released at https://aka.ms/llm-augmenter, which would include generated outputs." 189 }, 190 "data_collection_described": { 191 "applies": true, 192 "answer": true, 193 "justification": "Section 3.1 describes the News Chat data collection: crawling Reddit threads from 2021-2022, filtering by news websites, extracting oracle passages via ROUGE-F1, and applying threshold filtering. Customer Service uses DSTC11 Track 5 with 14768 sessions." 
194 }, 195 "recruitment_methods_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "For human evaluation (Section 3.4), the paper describes: 'using Amazon Mechanical Turk, we hired master-level workers with lifetime HIT acceptance rate above 95%.' This describes the recruitment channel and qualification criteria." 199 }, 200 "data_pipeline_documented": { 201 "applies": true, 202 "answer": true, 203 "justification": "Section 3.1 documents the pipeline: Reddit threads -> URL domain filtering -> oracle passage extraction via ROUGE-F1 -> threshold filtering -> 1370 examples. For Wiki QA, the OTT-QA pipeline including DPR retrieval, entity linking, and evidence chaining is described in Section 4.2." 204 } 205 }, 206 "conflicts_of_interest": { 207 "funding_disclosed": { 208 "applies": true, 209 "answer": false, 210 "justification": "No funding section is present. The Acknowledgements section thanks individuals for discussions but does not disclose funding sources." 211 }, 212 "affiliations_disclosed": { 213 "applies": true, 214 "answer": true, 215 "justification": "Author affiliations are clearly stated: Microsoft Research and Columbia University. All but one author (Zhou Yu) are from Microsoft Research." 216 }, 217 "funder_independent_of_outcome": { 218 "applies": true, 219 "answer": false, 220 "justification": "The work is from Microsoft Research. Microsoft has a direct commercial interest in the success of LLM augmentation systems (Azure OpenAI, Bing Chat, etc.). The funder is not independent of the outcome. No funding disclosure is provided to assess this, but the affiliation itself creates a non-independent relationship." 221 }, 222 "financial_interests_declared": { 223 "applies": true, 224 "answer": false, 225 "justification": "No competing interests statement is present. Microsoft Research authors working on LLM augmentation tools have potential commercial interests that are not declared." 
226 } 227 }, 228 "contamination": { 229 "training_cutoff_stated": { 230 "applies": true, 231 "answer": false, 232 "justification": "The paper uses ChatGPT but does not state its training data cutoff date. This is relevant because the DSTC7 and DSTC11 datasets may have been in ChatGPT's training data." 233 }, 234 "train_test_overlap_discussed": { 235 "applies": true, 236 "answer": false, 237 "justification": "No discussion of whether ChatGPT's training data includes the DSTC7, DSTC11, or OTT-QA datasets. These are publicly available benchmarks that could have been in the training data." 238 }, 239 "benchmark_contamination_addressed": { 240 "applies": true, 241 "answer": false, 242 "justification": "OTT-QA was published in 2020, DSTC7 in 2019, and DSTC9/11 in 2020/2023. ChatGPT was trained on data up to at least 2021. The paper does not address whether these benchmarks were in the training data." 243 } 244 }, 245 "human_studies": { 246 "pre_registered": { 247 "applies": true, 248 "answer": false, 249 "justification": "No pre-registration is mentioned for the human evaluation study." 250 }, 251 "irb_or_ethics_approval": { 252 "applies": true, 253 "answer": false, 254 "justification": "No IRB or ethics board approval is mentioned despite using Amazon Mechanical Turk workers for human evaluation." 255 }, 256 "demographics_reported": { 257 "applies": true, 258 "answer": false, 259 "justification": "No demographics are reported for the Mechanical Turk workers beyond 'master-level workers with lifetime HIT acceptance rate above 95%.' No information on experience, geographic distribution, or other characteristics." 260 }, 261 "inclusion_exclusion_criteria": { 262 "applies": true, 263 "answer": true, 264 "justification": "The paper specifies inclusion criteria: 'master-level workers with lifetime HIT acceptance rate above 95%' (Section 3.4). This defines who was eligible to participate." 
265 }, 266 "randomization_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "The human evaluation is a rating study comparing outputs of two systems, not an experimental study with treatment/control groups requiring randomization." 270 }, 271 "blinding_described": { 272 "applies": true, 273 "answer": false, 274 "justification": "No blinding is described for the human evaluation. It is not stated whether evaluators knew which responses came from ChatGPT vs. LLM-AUGMENTER." 275 }, 276 "attrition_reported": { 277 "applies": true, 278 "answer": false, 279 "justification": "No attrition information is provided. The paper states 948 examples were used but does not report how many workers participated, how many dropped out, or quality control filtering." 280 } 281 }, 282 "cost_and_practicality": { 283 "inference_cost_reported": { 284 "applies": true, 285 "answer": false, 286 "justification": "No inference cost, API cost, or latency numbers are reported. Section 6 qualitatively mentions that 'interactive feedback with a computationally expensive model such as ChatGPT can significantly slow down the user experience' and that 'ChatGPT is often queried twice for a single response,' but no quantitative cost data is given." 287 }, 288 "compute_budget_stated": { 289 "applies": true, 290 "answer": false, 291 "justification": "No total computational budget, API spend, GPU hours, or hardware specifications are reported." 292 } 293 } 294 }, 295 "claims": [ 296 { 297 "claim": "LLM-AUGMENTER significantly reduces ChatGPT's hallucinations without sacrificing fluency and informativeness.", 298 "evidence": "Tables 1, 2 show KF1 improvements of ~10 points (News Chat) and ~6 points (Customer Service) with BM25 retrieval, while maintaining or improving BLEU, ROUGE, chrF, METEOR, BERTScore, BARTScore, and BLEURT. 
Table 3 human evaluation shows 32.3% improvement in Usefulness and 12.9% in Humanness.", 299 "supported": "strong" 300 }, 301 { 302 "claim": "Automated feedback further improves response quality beyond external knowledge alone.", 303 "evidence": "Tables 1 and 2 show consistent improvements when feedback is added: +3.3 KF1 on News Chat and +7.2 on Customer Service with golden knowledge. Table 4 ablation shows w/ Rule-based Feedback achieves 37.41 vs. 34.07 without feedback.", 304 "supported": "strong" 305 }, 306 { 307 "claim": "Knowledge consolidation (CORE) is superior to raw retrieval (DPR) for grounding LLM responses.", 308 "evidence": "Table 5 shows CORE achieves 8.08 F1 vs. DPR's 2.38 F1 on Wiki QA, a substantial improvement from evidence consolidation.", 309 "supported": "strong" 310 }, 311 { 312 "claim": "A trainable RL policy can learn to effectively select actions to maximize reward.", 313 "evidence": "Figure 3 shows the learning curve with T5-Base policy improving from random baseline (~33 KF1) to ~37.5 KF1 over 1000 training episodes, averaged over 5 runs. However, this is with T5-Base, not ChatGPT, due to bandwidth limitations.", 314 "supported": "moderate" 315 }, 316 { 317 "claim": "LLM-AUGMENTER is applicable to other LLMs beyond ChatGPT.", 318 "evidence": "The paper states 'It is straightforward to apply LLM-AUGMENTER to other LLMs, such as GPT-3 or PaLM' (Section 3.2), but no experiments with other LLMs are provided.", 319 "supported": "unsupported" 320 } 321 ], 322 "methodology_tags": [ 323 "benchmark-eval" 324 ], 325 "key_findings": "LLM-AUGMENTER is a plug-and-play framework that augments black-box LLMs with external knowledge retrieval and automated feedback to reduce hallucinations. On information-seeking dialog tasks (DSTC7/DSTC11), it improves ChatGPT's Knowledge F1 by 6-10 points with BM25 retrieval and by 21-34 points with golden knowledge, while maintaining fluency metrics. Human evaluation shows 32.3% Usefulness improvement. 
On multi-hop Wiki QA (OTT-QA), knowledge consolidation via CORE with feedback achieves 11.80 F1 vs. 0.59 for closed-book ChatGPT, but still lags behind fine-tuned state-of-the-art models.", 326 "red_flags": [ 327 { 328 "flag": "Company evaluating own product-adjacent technology", 329 "detail": "The work is primarily from Microsoft Research. Microsoft has commercial interests in LLM augmentation (Azure OpenAI, Bing Chat). While the system is open-sourced, the evaluation setting favors demonstrating the value of augmentation infrastructure Microsoft could commercialize." 330 }, 331 { 332 "flag": "Unversioned model", 333 "detail": "ChatGPT is used without specifying a version or API snapshot date. ChatGPT behavior changed significantly during early 2023, making these results non-reproducible without knowing exactly which model version was used." 334 }, 335 { 336 "flag": "Very low inter-annotator agreement in human evaluation", 337 "detail": "Table 3 reports Krippendorff's alpha of 0.15 for Usefulness and 0.07 for Humanness. An alpha of 0.07 indicates essentially no agreement beyond chance. The human evaluation results are therefore unreliable despite being reported as significant." 338 }, 339 { 340 "flag": "No contamination analysis", 341 "detail": "All three evaluation benchmarks (DSTC7, DSTC11, OTT-QA) are publicly available and may have been in ChatGPT's training data. The baseline ChatGPT performance may be artificially inflated or deflated due to data contamination, which is not discussed." 342 }, 343 { 344 "flag": "No variance reported for main results", 345 "detail": "The main automatic evaluation results in Tables 1, 2, and 5 appear to be single-run results with no variance reporting. Only the RL training curve (Figure 3) shows multi-run spread." 
346 } 347 ], 348 "cited_papers": [ 349 { 350 "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", 351 "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"], 352 "year": 2020, 353 "relevance": "Foundational RAG paper proposing joint retriever-generator architectures for knowledge-grounded text generation." 354 }, 355 { 356 "title": "REALM: Retrieval-Augmented Language Model Pre-training", 357 "authors": ["Kelvin Guu", "Kenton Lee", "Zora Tung"], 358 "year": 2020, 359 "relevance": "Pioneering work on retrieval-augmented language model pretraining, directly relevant to augmenting LLMs with external knowledge." 360 }, 361 { 362 "title": "Retrieval Augmentation Reduces Hallucination in Conversation", 363 "authors": ["Kurt Shuster", "Spencer Poff", "Moya Chen"], 364 "year": 2021, 365 "arxiv_id": "2104.07567", 366 "relevance": "Directly addresses hallucination reduction through retrieval augmentation, a core premise of this paper." 367 }, 368 { 369 "title": "REPLUG: Retrieval-Augmented Black-Box Language Models", 370 "authors": ["Weijia Shi", "Sewon Min", "Michihiro Yasunaga"], 371 "year": 2023, 372 "relevance": "Contemporary approach to augmenting black-box LLMs with retrieval, directly comparable to LLM-AUGMENTER." 373 }, 374 { 375 "title": "Toolformer: Language Models Can Teach Themselves to Use Tools", 376 "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessi"], 377 "year": 2023, 378 "relevance": "Related approach to augmenting LLMs with external tool use, relevant to the agentic AI tool-use paradigm." 379 }, 380 { 381 "title": "WebGPT: Browser-assisted question-answering with human feedback", 382 "authors": ["Reiichiro Nakano", "Jacob Hilton", "Suchir Balaji"], 383 "year": 2021, 384 "arxiv_id": "2112.09332", 385 "relevance": "Combines web knowledge with LLMs for question answering, a closely related system to LLM-AUGMENTER." 
386 }, 387 { 388 "title": "Rethinking with Retrieval: Faithful Large Language Model Inference", 389 "authors": ["Hangfeng He", "Hongming Zhang", "Dan Roth"], 390 "year": 2022, 391 "relevance": "Proposes making LLMs more faithful through retrieval, directly relevant to the hallucination reduction goal." 392 }, 393 { 394 "title": "Language Models are Few-Shot Learners", 395 "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"], 396 "year": 2020, 397 "relevance": "Foundational GPT-3 paper relevant to understanding the LLMs being augmented in this work." 398 }, 399 { 400 "title": "Training Language Models to Follow Instructions with Human Feedback", 401 "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"], 402 "year": 2022, 403 "relevance": "InstructGPT/RLHF paper relevant to understanding ChatGPT's instruction-following capabilities used in LLM-AUGMENTER." 404 }, 405 { 406 "title": "Improving Alignment of Dialogue Agents via Targeted Human Judgements", 407 "authors": ["Amelia Glaese", "Nathan McAleese", "Maja Trębacz"], 408 "year": 2022, 409 "relevance": "Inspired the utility function design in LLM-AUGMENTER for evaluating response alignment." 410 }, 411 { 412 "title": "GODEL: Large-Scale Pre-Training for Goal-Directed Dialog", 413 "authors": ["Baolin Peng", "Michel Galley", "Pengcheng He"], 414 "year": 2022, 415 "relevance": "Prior work by the same authors on dialog systems, providing the evaluation protocol used in this paper." 416 } 417 ] 418 }