scan.json (26238B)
1 { 2 "paper": { 3 "title": "Benchmarking Hallucination in Large Language Models based on Unanswerable Math Word Problem", 4 "authors": [ 5 "Yuhong Sun", 6 "Zhangyue Yin", 7 "Qipeng Guo", 8 "Jiawen Wu", 9 "Xipeng Qiu", 10 "Hui Zhao" 11 ], 12 "year": 2024, 13 "venue": "arXiv", 14 "arxiv_id": "2403.03558" 15 }, 16 "checklist": { 17 "artifacts": { 18 "code_released": { 19 "applies": true, 20 "answer": true, 21 "justification": "The abstract states 'Our code and data are available at https://github.com/Yuki-Asuuna/UMWP.' A working GitHub URL is provided." 22 }, 23 "data_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The dataset UMWP is released at the same GitHub repository. Additionally, the answerable portions come from publicly available datasets (GSM8K, SVAMP, MultiArith, ASDiv)." 27 }, 28 "environment_specified": { 29 "applies": true, 30 "answer": false, 31 "justification": "No environment specifications, requirements.txt, Dockerfile, or dependency details are mentioned in the paper. The paper mentions using Spacy v3.6.1 and SimCSE but does not provide a full environment setup." 32 }, 33 "reproduction_instructions": { 34 "applies": true, 35 "answer": false, 36 "justification": "No step-by-step reproduction instructions are provided in the paper. While code is released on GitHub, the paper itself does not include a 'Reproducing Results' section or detailed instructions." 37 } 38 }, 39 "statistical_methodology": { 40 "confidence_intervals_or_error_bars": { 41 "applies": true, 42 "answer": false, 43 "justification": "All results are reported as point estimates (e.g., F1 scores in Figure 3 and Figure 4). No confidence intervals or error bars are shown on any figures or tables." 
44 }, 45 "significance_tests": { 46 "applies": true, 47 "answer": false, 48 "justification": "The paper makes comparative claims (e.g., 'RLHF substantially improves the F1 score', 'ICL and Instruction input forms significantly improve') but no statistical significance tests are reported. Comparisons are based solely on raw F1 score differences." 49 }, 50 "effect_sizes_reported": { 51 "applies": true, 52 "answer": false, 53 "justification": "The paper reports raw F1 scores but does not provide effect sizes (e.g., Cohen's d) for its comparative claims about model size, RLHF, or input form effects. Cohen's kappa is reported for evaluation method agreement (Table 3) but this measures inter-rater agreement, not effect size for the main claims." 54 }, 55 "sample_size_justified": { 56 "applies": true, 57 "answer": false, 58 "justification": "The dataset contains 5,200 questions and the human benchmark uses 200 samples from 5 volunteers, but no justification is provided for why these sample sizes are sufficient. No power analysis is discussed." 59 }, 60 "variance_reported": { 61 "applies": true, 62 "answer": false, 63 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Results appear to be from single runs with no indication of repeated experiments or result stability." 64 } 65 }, 66 "evaluation_design": { 67 "baselines_included": { 68 "applies": true, 69 "answer": true, 70 "justification": "The paper compares 31 LLMs across different series (GPT-3, InstructGPT, Claude, LLaMA) and includes a human benchmark (Section 5.2, Figure 4). The comparison between template-only and template+rule evaluation methods (Table 3) also serves as a baseline comparison." 71 }, 72 "baselines_contemporary": { 73 "applies": true, 74 "answer": true, 75 "justification": "The evaluated models include GPT-4-0613 and Claude-2, which were contemporary at the time of writing (2024). The LLaMA-2-70b-chat was also recent. 
The paper covers a wide range from older (text-ada-001) to state-of-the-art models." 76 }, 77 "ablation_study": { 78 "applies": true, 79 "answer": true, 80 "justification": "Table 3 compares two evaluation method variants: Template-only vs Template+Rule (mathematical expression detection), showing the contribution of the expression detection component. Section 5.1 also reports a threshold ablation for the similarity function." 81 }, 82 "multiple_metrics": { 83 "applies": true, 84 "answer": true, 85 "justification": "The paper uses F1 score as the primary metric and also reports Cohen's kappa coefficient for evaluation method agreement (Table 3). Accuracy of answerable questions is reported separately in Appendix A.4 (Figure 5)." 86 }, 87 "human_evaluation": { 88 "applies": true, 89 "answer": true, 90 "justification": "A human benchmark is established in Section 5.2 using 5 volunteers who evaluate 200 randomly sampled questions. The human F1 score of 93.16% is compared against LLMs (Figure 4). Additionally, 5 annotators participate in the evaluation method comparison (Section 5.4)." 91 }, 92 "held_out_test_set": { 93 "applies": true, 94 "answer": false, 95 "justification": "The entire UMWP dataset of 5,200 questions is used for evaluation. There is no mention of a held-out test set or dev/test split. All models are evaluated on the same full dataset." 96 }, 97 "per_category_breakdown": { 98 "applies": true, 99 "answer": true, 100 "justification": "Table 1 provides the distribution across five unanswerable question categories (Key Information Missing 32%, Ambiguous Key Information 49%, Unrealistic Conditions 11%, Unrelated Object 4%, Question Missing 5%). Results are broken down by model series (Figure 3) and by input form (Direct, Instruction, ICL)." 
101 }, 102 "failure_cases_discussed": { 103 "applies": true, 104 "answer": true, 105 "justification": "Section A.6 (Table 6) provides a detailed case study of hallucination examples categorized by failure type (making arbitrary assumptions, ignoring scope condition, ignoring unreasonable condition, etc.). Section 5.5 discusses noise analysis of irrelevant outputs." 106 }, 107 "negative_results_reported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The paper reports that text-babbage-001 breaks the general trend of improvement with model size in the InstructGPT series (Section 5.4). The paper also shows that many models perform poorly, with some achieving very low F1 scores (e.g., near 0 for small LLaMA models in Direct form)." 111 } 112 }, 113 "claims_and_evidence": { 114 "abstract_claims_supported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The abstract claims that ICL and RLHF 'significantly enhance the model's ability to avoid hallucination,' which is supported by the results in Figure 3 showing consistent F1 improvements with RLHF and ICL input forms. The claim about testing 31 LLMs is verified in the experiments." 118 }, 119 "causal_claims_justified": { 120 "applies": true, 121 "answer": false, 122 "justification": "The paper makes causal claims such as 'RLHF substantially improves the F1 score' and 'ICL input forms...significantly improving the LLMs' ability to recognize hallucination.' These are observational comparisons between different model versions, not controlled experiments. The improvement could be due to many factors beyond RLHF alone (different training data, model architecture changes, etc.)." 123 }, 124 "generalization_bounded": { 125 "applies": true, 126 "answer": false, 127 "justification": "The title claims to benchmark 'Hallucination in Large Language Models' broadly, but the paper only tests hallucination in the specific context of unanswerable math word problems in English. 
The abstract claims 'utilizing MWP is a reliable and effective approach to assess hallucination' without bounding this to the tested domain. The Limitations section partially addresses this but the title and abstract overreach." 128 }, 129 "alternative_explanations_discussed": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper does not discuss alternative explanations for the observed results. For example, the RLHF improvement could be due to instruction-following ability rather than hallucination reduction. The model size effect could be confounded with training data differences. No threats-to-validity or alternative explanation analysis is provided." 133 } 134 }, 135 "setup_transparency": { 136 "model_versions_specified": { 137 "applies": true, 138 "answer": true, 139 "justification": "Most model versions are specified with snapshot dates or version identifiers: gpt-4-0613, gpt-4-0314, gpt-3.5-turbo-0613, gpt-3.5-turbo-0301, text-davinci-001/002/003, claude-1, claude-instant-1/1.1/1.2, claude-2, LLaMA-7b/13b/30b/65b, LLaMA-v2-7b/13b/70b and their chat variants. The GPT models include snapshot dates (e.g., 0613, 0314). Claude models use version numbers (claude-1, claude-2) which, while not full snapshot dates, were the standard identifiers at the time." 140 }, 141 "prompts_provided": { 142 "applies": true, 143 "answer": true, 144 "justification": "Full prompt templates for all three input forms (Direct, Instruction, ICL) are provided in Figures 6, 7, and 8, including the complete ICL examples with 8 few-shot demonstrations. The instruction text and all fill values are shown." 145 }, 146 "hyperparameters_reported": { 147 "applies": true, 148 "answer": false, 149 "justification": "Section 5.1 states 'we set the temperature T = 0.7 for GPT, InstructGPT, LLaMA, and LLaMA-2' and reports the similarity threshold T=0.75, but temperature is not stated for Claude models. 
No other generation hyperparameters (top-p, max tokens) are reported for any model." 150 }, 151 "scaffolding_described": { 152 "applies": false, 153 "answer": false, 154 "justification": "No agentic scaffolding is used. The paper sends prompts directly to LLMs and evaluates their outputs. There is no multi-step agent workflow." 155 }, 156 "data_preprocessing_documented": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 3 describes the dataset construction process: source datasets (SVAMP, MultiArith, GSM8K, ASDiv), modification strategies (Table 5 with 8 strategies), annotator process (2 annotators modify, 3 volunteers validate, unanimous agreement required). Table 2 provides statistics of the answerable questions by source." 160 } 161 }, 162 "limitations_and_scope": { 163 "limitations_section_present": { 164 "applies": true, 165 "answer": true, 166 "justification": "A dedicated 'Limitations' section is present (after the Ethics Statement) discussing that the work focuses only on English QA and does not explore other tasks like summarization or code generation, and that it only addresses hallucination from the prompt engineering perspective." 167 }, 168 "threats_to_validity_specific": { 169 "applies": true, 170 "answer": false, 171 "justification": "The Limitations section mentions only generic limitations: focus on English QA, not exploring other tasks, and only addressing prompt engineering perspective. These are not specific threats to the validity of the reported results (e.g., no discussion of evaluation method reliability, sample bias, or confounds in the RLHF comparison)." 172 }, 173 "scope_boundaries_stated": { 174 "applies": true, 175 "answer": false, 176 "justification": "The Limitations section mentions focus on English QA but does not explicitly state what the results do NOT show. For example, it does not bound the generalization claims: results are about math word problem hallucination specifically, not hallucination in general. 
The title and abstract imply broader applicability than demonstrated." 177 } 178 }, 179 "data_integrity": { 180 "raw_data_available": { 181 "applies": true, 182 "answer": true, 183 "justification": "The UMWP dataset is released at the GitHub repository (https://github.com/Yuki-Asuuna/UMWP), allowing independent verification of the benchmark questions. The source datasets (GSM8K, SVAMP, MultiArith, ASDiv) are also publicly available." 184 }, 185 "data_collection_described": { 186 "applies": true, 187 "answer": true, 188 "justification": "Section 3 describes the data collection: four source MWP datasets were used, two annotators modified answerable questions into unanswerable ones using 8 specific strategies (Table 5), and three volunteers validated each question (unanimous agreement required for acceptance). Statistics are provided in Tables 1 and 2." 189 }, 190 "recruitment_methods_described": { 191 "applies": true, 192 "answer": false, 193 "justification": "The paper mentions '5 volunteers' for the human benchmark and '2 data annotators' and '3 volunteers' for dataset construction, but provides no information about how these individuals were recruited, their qualifications, or potential selection bias. The Ethics Statement mentions compensation but not recruitment channels." 194 }, 195 "data_pipeline_documented": { 196 "applies": true, 197 "answer": true, 198 "justification": "The pipeline is documented: source datasets selected (Table 2) -> 2 annotators apply 8 modification strategies (Table 5) -> 3 volunteers validate -> questions with unanimous unanswerable annotations accepted -> final dataset of 2,600 answerable + 2,600 unanswerable questions. Algorithm 1 documents the evaluation pipeline." 
199 } 200 }, 201 "conflicts_of_interest": { 202 "funding_disclosed": { 203 "applies": true, 204 "answer": true, 205 "justification": "The Acknowledgments section states: 'This work is supported by National Key Research and Development Program of China(2022YFC3302600).'" 206 }, 207 "affiliations_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "Author affiliations are listed: East China Normal University, Fudan University, and Shanghai AI Laboratory. None of the authors appear to be affiliated with the companies whose models are evaluated (OpenAI, Meta, Anthropic)." 211 }, 212 "funder_independent_of_outcome": { 213 "applies": true, 214 "answer": true, 215 "justification": "The funder is the National Key Research and Development Program of China, a government funding body that has no financial interest in the relative performance of any specific LLM evaluated in this study." 216 }, 217 "financial_interests_declared": { 218 "applies": true, 219 "answer": false, 220 "justification": "No competing interests or financial interests statement is included in the paper. While the authors appear to be academic researchers with no obvious conflicts, the absence of an explicit declaration means this criterion is not satisfied." 221 } 222 }, 223 "contamination": { 224 "training_cutoff_stated": { 225 "applies": true, 226 "answer": false, 227 "justification": "The paper evaluates 31 LLMs on the UMWP benchmark but does not state the training data cutoff dates for any of the models. This is relevant because UMWP is built from publicly available datasets (GSM8K, SVAMP, etc.) that could be in the training data." 228 }, 229 "train_test_overlap_discussed": { 230 "applies": true, 231 "answer": false, 232 "justification": "The answerable questions in UMWP come directly from publicly available datasets (GSM8K, SVAMP, MultiArith, ASDiv) that are likely in the training data of the evaluated models. 
The paper does not discuss this potential contamination issue at all." 233 }, 234 "benchmark_contamination_addressed": { 235 "applies": true, 236 "answer": false, 237 "justification": "GSM8K (2021), SVAMP (2021), MultiArith (2016), and ASDiv (2020) were all published before the training cutoffs of models like GPT-4 and LLaMA-2. The answerable portion of UMWP uses these questions directly, creating contamination risk for the answerable question accuracy. This is not discussed." 238 } 239 }, 240 "human_studies": { 241 "pre_registered": { 242 "applies": false, 243 "answer": false, 244 "justification": "The human involvement (5 volunteers for benchmarking, annotators for dataset creation) is minimal and serves as a reference point, not a human subjects study. This is a benchmark evaluation paper." 245 }, 246 "irb_or_ethics_approval": { 247 "applies": false, 248 "answer": false, 249 "justification": "The human participants are volunteers performing annotation and benchmark tasks, not subjects of a human study. The Ethics Statement discusses data ethics but this is not a human subjects study requiring IRB approval." 250 }, 251 "demographics_reported": { 252 "applies": false, 253 "answer": false, 254 "justification": "This is a benchmark evaluation paper. The volunteers and annotators serve as reference points, not as subjects of a human study." 255 }, 256 "inclusion_exclusion_criteria": { 257 "applies": false, 258 "answer": false, 259 "justification": "Not a human subjects study. Annotators and volunteers are tools for dataset construction and validation, not study participants." 260 }, 261 "randomization_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "Not an experimental study with human participants assigned to conditions." 265 }, 266 "blinding_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "Not an experimental study with human participants requiring blinding." 
270 }, 271 "attrition_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "Not a human subjects study with participant attrition concerns." 275 } 276 }, 277 "cost_and_practicality": { 278 "inference_cost_reported": { 279 "applies": true, 280 "answer": false, 281 "justification": "The paper runs inference on 31 LLMs across 5,200 questions in 3 input forms, which involves substantial API costs. No inference costs, tokens consumed, or wall-clock time are reported." 282 }, 283 "compute_budget_stated": { 284 "applies": true, 285 "answer": false, 286 "justification": "No computational budget information is provided. The paper does not state the hardware used for running open-source models (LLaMA series) or the total API spend for commercial models." 287 } 288 } 289 }, 290 "claims": [ 291 { 292 "claim": "In-context learning and RLHF training significantly enhance the model's ability to avoid hallucination on unanswerable math word problems.", 293 "evidence": "Figure 3 shows F1 score improvements with ICL input form vs Direct for all model series. RLHF chat models (LLaMA-v2-7b-chat, 13b-chat, 70b-chat) consistently outperform their base counterparts across all three input forms (Section 5.4).", 294 "supported": "moderate" 295 }, 296 { 297 "claim": "Larger model size generally leads to better hallucination avoidance (higher F1 scores).", 298 "evidence": "Figure 3 shows continuous F1 improvement as model size increases in the LLaMA and InstructGPT series, with the exception of text-babbage-001 in InstructGPT (Section 5.4, 'Model Size').", 299 "supported": "moderate" 300 }, 301 { 302 "claim": "GPT-4 achieves the best LLM performance with an F1 score of 85.24% under instruction input, but still falls short of the human benchmark of 93.16%.", 303 "evidence": "Figure 4 directly shows these numbers. 
The human benchmark was established with 5 volunteers on 200 randomly sampled questions (Section 5.2).", 304 "supported": "moderate" 305 }, 306 { 307 "claim": "Combining text similarity templates with mathematical expression detection improves consistency with human judgment compared to template-only evaluation.", 308 "evidence": "Table 3 shows Cohen's kappa improvement across 5 models when adding rule-based expression detection: improvements range from +0.027 (gpt-4) to +0.072 (text-davinci-003). All kappa values exceed 0.75.", 309 "supported": "strong" 310 }, 311 { 312 "claim": "LLaMA-v2-13b-chat's performance can compete with LLaMA-65b despite having significantly fewer parameters.", 313 "evidence": "Figure 3 shows LLaMA-v2-13b-chat achieving comparable F1 scores to LLaMA-65b across input forms (Section 5.4, 'RLHF').", 314 "supported": "moderate" 315 } 316 ], 317 "methodology_tags": [ 318 "benchmark-eval" 319 ], 320 "key_findings": "The paper introduces UMWP, a dataset of 5,200 math word problems (half answerable, half unanswerable) for benchmarking LLM hallucination. Testing 31 LLMs across three input forms reveals that model size, RLHF training, and in-context learning all improve hallucination avoidance, with GPT-4 achieving the best F1 score (85.24%) but still trailing the human benchmark (93.16%). The paper also shows that combining template-based text similarity with mathematical expression detection produces evaluation results more consistent with human judgment (Cohen's kappa > 0.75 for all tested models).", 321 "red_flags": [ 322 { 323 "flag": "No uncertainty quantification", 324 "detail": "All experimental results are reported as single-point F1 scores with no error bars, confidence intervals, or variance across runs. Given that LLM outputs can vary with temperature=0.7, single-run results may not be stable." 
325 }, 326 { 327 "flag": "Benchmark contamination risk unaddressed", 328 "detail": "The answerable half of UMWP comes directly from GSM8K, SVAMP, MultiArith, and ASDiv, all publicly available before the training cutoffs of most evaluated models. Models that memorized these problems would perform differently on answerable vs novel unanswerable variants, but this is never discussed." 329 }, 330 { 331 "flag": "Causal claims from observational comparisons", 332 "detail": "The paper attributes F1 improvements specifically to RLHF and model size, but these comparisons confound multiple factors (different training data, architectural changes, training procedures). The claim that 'RLHF substantially improves' performance cannot be isolated from these confounds." 333 }, 334 { 335 "flag": "Overly broad title and claims", 336 "detail": "The title 'Benchmarking Hallucination in Large Language Models' and the abstract claim that 'MWP is a reliable and effective approach to assess hallucination' suggest general applicability, but the paper only tests one specific type of hallucination (failing to recognize unanswerable math problems) in English." 337 }, 338 { 339 "flag": "No statistical significance tests", 340 "detail": "Comparative claims about the effects of model size, RLHF, and input forms are supported only by visual comparison of F1 scores, without any statistical tests to confirm the differences are meaningful." 341 } 342 ], 343 "cited_papers": [ 344 { 345 "title": "A survey of hallucination in natural language generation", 346 "authors": ["Ziwei Ji", "Nayeon Lee", "Rita Frieske"], 347 "year": 2023, 348 "relevance": "Comprehensive survey of hallucination in NLG systems, directly relevant to LLM reliability and evaluation methodology." 
349 }, 350 { 351 "title": "TruthfulQA: Measuring how models mimic human falsehoods", 352 "authors": ["Stephanie Lin", "Jacob Hilton", "Owain Evans"], 353 "year": 2022, 354 "relevance": "Benchmark for evaluating LLM truthfulness, a key methodology for assessing LLM reliability." 355 }, 356 { 357 "title": "HaluEval: A large-scale hallucination evaluation benchmark for large language models", 358 "authors": ["Junyi Li", "Xiaoxue Cheng", "Wayne Xin Zhao"], 359 "year": 2023, 360 "relevance": "Large-scale hallucination evaluation benchmark for LLMs, directly comparable methodology for assessing hallucination." 361 }, 362 { 363 "title": "FActScore: Fine-grained atomic evaluation of factual precision in long form text generation", 364 "authors": ["Sewon Min", "Kalpesh Krishna", "Xinxi Lyu"], 365 "year": 2023, 366 "arxiv_id": "2305.14251", 367 "relevance": "Automated factuality evaluation methodology for LLMs, relevant to benchmarking and evaluation design." 368 }, 369 { 370 "title": "Chain-of-thought prompting elicits reasoning in large language models", 371 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 372 "year": 2022, 373 "relevance": "Foundational prompting technique for LLM reasoning, relevant to understanding how prompting affects LLM capability." 374 }, 375 { 376 "title": "Training language models to follow instructions with human feedback", 377 "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"], 378 "year": 2022, 379 "relevance": "InstructGPT paper describing RLHF training, directly relevant to understanding alignment and instruction-following capability." 380 }, 381 { 382 "title": "A survey of large language models", 383 "authors": ["Wayne Xin Zhao", "Kun Zhou", "Junyi Li"], 384 "year": 2023, 385 "arxiv_id": "2303.18223", 386 "relevance": "Comprehensive LLM survey relevant to understanding the landscape of models evaluated in LLM benchmarking studies." 
387 }, 388 { 389 "title": "Do large language models know what they don't know?", 390 "authors": ["Zhangyue Yin", "Qiushi Sun", "Qipeng Guo"], 391 "year": 2023, 392 "relevance": "Evaluates LLM self-knowledge using unanswerable questions, directly related to hallucination benchmarking methodology." 393 }, 394 { 395 "title": "Training verifiers to solve math word problems", 396 "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian"], 397 "year": 2021, 398 "arxiv_id": "2110.14168", 399 "relevance": "Introduces GSM8K dataset used as source for UMWP, relevant to LLM math reasoning evaluation methodology." 400 }, 401 { 402 "title": "GPT-4 technical report", 403 "authors": ["OpenAI"], 404 "year": 2023, 405 "relevance": "Technical report for GPT-4, the top-performing model in this benchmark, relevant to LLM capability evaluation." 406 } 407 ] 408 }