scan.json (24332B)
1 { 2 "paper": { 3 "title": "Large Language Models are Better Reasoners with Self-Verification", 4 "authors": ["Yixuan Weng", "Minjun Zhu", "Fei Xia", "Bin Li", "Shizhu He", "Shengping Liu", "Bin Sun", "Kang Liu", "Jun Zhao"], 5 "year": 2022, 6 "venue": "Findings of EMNLP 2023", 7 "arxiv_id": "2212.09561", 8 "doi": "10.18653/v1/2023.findings-emnlp.167" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Self-verification improves LLM reasoning by using backward verification to validate candidate answers generated via chain-of-thought prompting. The method achieves consistent improvements across 8 reasoning datasets (e.g., +4.33% on GSM8K with Instruct-GPT) and can be combined with self-consistency and PAL for further gains; the paper also shows that self-verification is an emergent ability of larger models. Condition Mask Verification outperforms True-False Item Verification on arithmetic tasks.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "GitHub link provided in abstract: https://github.com/WENGSYX/Self-Verification." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "All 8 datasets used are publicly available standard benchmarks (GSM8K, SingleEq, AddSub, MultiArith, AQUA-RAT, SVAMP, CSQA, Date Understanding). URLs provided in Appendix A.2." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No environment specifications, requirements.txt, or dependency details provided in the paper." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions provided. The paper describes the method and hyperparameters but lacks a reproducibility guide." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Results in Table 1 are point estimates only. No confidence intervals or error bars reported despite running each experiment three times." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "No statistical significance tests performed. Claims of improvement are based solely on comparing raw accuracy numbers." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Table 1 reports absolute improvements in parentheses with baseline context (e.g., '60.81 → 65.14 (+4.33)'), providing both magnitude and baseline reference." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification for the choice of K=5 candidate answers or P=10 verification rounds beyond 'we recommend choosing an appropriate value for P (e.g. P=10).' No power analysis." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Paper states 'we ran each experiment three times and calculated the average result' (Section 4.4) but does not report standard deviation or any spread measure across those runs." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Table 1 compares against CoT baseline, self-consistency (Wang et al., 2023c), PAL (Gao et al., 2023), and previous fine-tuned SOTA results." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Self-consistency (2023) and PAL (2023) were contemporary methods at time of submission. Previous SOTA fine-tuned results also included." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Multiple ablation-style analyses: single vs. multiple condition masks (Figure 5), CMV vs. 
TFV (Figure 6), varying P values (Figure 7), varying number of few-shot prompts (Figure 4)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": false, 83 "justification": "Only accuracy (problem solve rate) is reported as the evaluation metric. No other metrics used." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "Human evaluation is not relevant for measuring accuracy on reasoning benchmarks with ground-truth answers." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are reported on standard test splits of established benchmarks (GSM8K test set of 1319 examples, etc., as shown in Table 6)." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results broken down across 8 individual datasets spanning 3 reasoning task types (arithmetic, commonsense, logical). Table 2 provides further per-dataset analysis." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Table 7 in the appendix shows specific failure examples for each dataset with incorrect self-verification outputs marked with [✗] and ground truth answers." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Figure 3b shows self-verification has negative impact on smaller models (0.4B and 1.3B). Paper acknowledges 'it is challenging to augment the reasoning performance of smaller language models' in Limitations." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims of improvement on arithmetic, commonsense, and logical reasoning datasets are supported by Table 1 results. The specific numbers cited (60.8→65.1 on GSM8K, 91.01→93.40 on SingleEq) match the table." 
116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper makes causal claims about self-verification improving performance. The ablation studies (Figures 5-7) use controlled single-variable manipulation, which is adequate for the claims." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "Title claims LLMs 'are Better Reasoners with Self-Verification' but experiments only use GPT-3 variants (code-davinci-001, code-davinci-002, text-ada/babbage/curie/davinci). No testing on other LLM families." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No discussion of alternative explanations for the improvements. For example, self-verification uses additional compute (multiple decoding passes), but no comparison against simply generating more candidate answers with the same compute budget." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "Paper measures accuracy on reasoning benchmarks and claims improvements on reasoning benchmarks — no proxy gap. Claims match measurement granularity." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Specific model identifiers provided: code-davinci-001, code-davinci-002, text-ada-001, text-babbage-001, text-curie-001, text-davinci-002 (Section 4.2, Figure 3)." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Full prompt text provided in Appendix A.5-A.7 (Tables 8-17) for both forward reasoning and backward verification across all task types." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 4.4: K=5 candidate answers, P=10 verification iterations, max token length 168, sampling decoding without top-k truncation. 
Temperature not explicitly stated but sampling decoding specified." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding used. The method is a prompting strategy with two-stage inference (forward reasoning + backward verification), not an agent scaffold." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Answer cleansing strategy described in detail in Appendix A.1 with pseudo code (Table 3) for each answer format (number, multiple choice, true/false, yes/no, free format)." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Dedicated 'Limitations' section present after the Conclusion, discussing dependency on LLMs, prompt bias, model size constraints, and computational cost." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Limitations section discusses specific threats: prompts are 'artificially constructed and may introduce bias', effectiveness depends on 'accurate answers within the candidate conclusions', and smaller models cannot benefit (citing Figure 3)." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Paper states method 'is not suitable for evaluating the LLM's inference procedure' (only conclusions), acknowledges it 'is challenging to augment the reasoning performance of smaller language models', and notes arithmetic tasks benefit more than general tasks." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw experimental outputs (model predictions, verification scores) are released. Only aggregated accuracy numbers reported." 
187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Data sources are standard public benchmarks. Dataset details including size, average word count, answer format, and license listed in Table 6 with URLs in Appendix A.2." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data sources are standard benchmarks." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "Pipeline from input to output is documented: CoT prompting → sampling K=5 candidates → rewriting conclusions → condition masking/TFV → scoring → selection. Answer cleansing in Appendix A.1." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "Acknowledgements section lists National Key R&D Program of China, NSFC grants, Strategic Priority Research Program of CAS, Youth Innovation Promotion Association CAS, and OPPO Research Fund." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All author affiliations listed: CAS, University of Chinese Academy of Sciences, Hunan University, Unisound, Shanghai AI Laboratory." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": true, 218 "justification": "Funders are government research agencies (NSFC, CAS) and OPPO Research Fund. None have a direct stake in whether self-verification improves GPT-3 reasoning." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement provided. One author is affiliated with Unisound (a commercial company) but no declaration of financial interests." 
224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No mention of training data cutoff dates for any of the GPT-3 models used." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether benchmark datasets (some dating to 2014-2021) appeared in GPT-3's training data." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "Several benchmarks (MultiArith 2016, SingleEq 2015, AddSub 2014) were publicly available years before the evaluated GPT-3-family models were trained, and GSM8K (2021) may overlap with the training data of the later code-davinci models. No contamination analysis performed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No API costs or dollar amounts reported. Figure 7 shows performance vs. number of verification iterations P but not in terms of cost. 
The method requires up to K×R×P additional verification calls per example (K candidates, R masked conditions, P samples each), but this cost is not quantified." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total compute budget stated. The Reproducibility Statement (A.3) gives experiment dates but not compute costs or API spend." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "Paper states experiments were run 3 times and averaged but does not report variance across runs or seed sensitivity." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Section 4.4: 'we ran each experiment three times and calculated the average result.'" 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No discussion of how K=5 and P=10 were selected. Figure 7 shows P sensitivity but no systematic search budget reported." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "Figure 7 shows performance across different P values and the paper recommends P=10 based on the performance plateau, showing all configurations." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "No statistical tests performed at all, let alone corrections for multiple comparisons across 8 datasets × multiple methods." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "Authors compare their self-verification method against their own implementations of baselines without acknowledging potential bias." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "Figure 7 explicitly shows performance as a function of computational resource (number of verification iterations P) on GSM8K. 
Limitations section acknowledges increased computational cost." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether the 8 benchmarks adequately measure 'reasoning ability' as claimed. Benchmarks are used without questioning construct validity." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding involved. The method is a prompting strategy, not a scaffold comparison." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of temporal leakage. Several benchmarks (MultiArith 2016, SingleEq 2015, AddSub 2014) predate GPT-3's training by years." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether few-shot prompts or evaluation setup leaks information." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of independence between training data and test benchmarks." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention methods applied." 
359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Self-verification improves CoT reasoning on Instruct-GPT by an average of 2.33% across 8 datasets", 365 "evidence": "Table 1 shows improvements ranging from +0.41% (CSQA) to +4.33% (GSM8K) with code-davinci-002.", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "Self-verification can be combined with self-consistency and PAL for further gains", 370 "evidence": "Table 1 bottom section shows SC+SV and PAL+SV consistently improve over SC and PAL alone across datasets.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "Self-verification is an emergent ability that improves with model scale", 375 "evidence": "Figure 3 shows self-verification hurts or has negligible effect on 0.4B-1.3B models but helps 7B and 175B models.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Condition Mask Verification outperforms True-False Item Verification on arithmetic tasks", 380 "evidence": "Figure 6 compares CMV vs TFV across 6 arithmetic datasets. CMV is generally better on Instruct-GPT, though TFV sometimes matches or beats CMV on GPT-3.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Multiple condition masks produce better verification scores than single condition masks", 385 "evidence": "Figure 5 shows multi-condition generally outperforms single-condition across 6 datasets, though single-condition sometimes underperforms CoT baseline.", 386 "supported": "moderate" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "No statistical significance testing", 392 "detail": "All improvements are reported as raw accuracy differences without any statistical tests. Many improvements are small (e.g., +0.06% to +0.46% for SC+SV on Instruct-GPT) and may not be statistically significant." 
393 }, 394 { 395 "flag": "No variance reported despite multiple runs", 396 "detail": "Paper states experiments were averaged over 3 runs but never reports standard deviation, making it impossible to assess whether improvements exceed run-to-run variance." 397 }, 398 { 399 "flag": "Contamination risk unaddressed", 400 "detail": "Several benchmarks (MultiArith 2016, SingleEq 2015, AddSub 2014) were publicly available years before GPT-3 training. No contamination analysis performed." 401 }, 402 { 403 "flag": "Compute fairness not addressed", 404 "detail": "Self-verification uses K×(R×P+1) API calls per example (K=5 candidates, R conditions, P=10 repeats) but is compared against single-pass CoT. The compute-matched comparison is missing." 405 }, 406 { 407 "flag": "Limited model diversity", 408 "detail": "All experiments use only OpenAI GPT-3 variants. The title claims 'Large Language Models' broadly but no other model families were tested." 409 } 410 ], 411 "cited_papers": [ 412 { 413 "title": "Chain of thought prompting elicits reasoning in large language models", 414 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Ed Chi", "Quoc Le", "Denny Zhou"], 415 "year": 2022, 416 "arxiv_id": "2201.11903", 417 "relevance": "Foundational CoT prompting method that this paper builds upon and extends with backward verification." 418 }, 419 { 420 "title": "Self-consistency improves chain of thought reasoning in language models", 421 "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans", "Quoc V Le", "Ed H. Chi"], 422 "year": 2023, 423 "relevance": "Key baseline and complementary method; self-verification is shown to stack with self-consistency decoding." 
424 }, 425 { 426 "title": "Training verifiers to solve math word problems", 427 "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian", "Jacob Hilton", "Reiichiro Nakano", "Christopher Hesse", "John Schulman"], 428 "year": 2021, 429 "arxiv_id": "2110.14168", 430 "relevance": "Proposes trained verifiers for math reasoning; this paper's self-verification aims to achieve similar without training." 431 }, 432 { 433 "title": "Let's verify step by step", 434 "authors": ["Hunter Lightman", "Vineet Kosaraju", "Yura Burda", "Harrison Edwards", "Bowen Baker", "Teddy Lee", "Jan Leike", "John Schulman", "Ilya Sutskever", "Karl Cobbe"], 435 "year": 2023, 436 "arxiv_id": "2305.20050", 437 "relevance": "Process-level verification for math reasoning using trained reward models; contrasts with this paper's training-free approach." 438 }, 439 { 440 "title": "PAL: Program-aided language models", 441 "authors": ["Luyu Gao", "Aman Madaan", "Shuyan Zhou", "Uri Alon", "Pengfei Liu", "Yiming Yang", "Jamie Callan", "Graham Neubig"], 442 "year": 2023, 443 "relevance": "Alternative forward reasoning method using programs; shown to be complementary with self-verification." 444 }, 445 { 446 "title": "Large language models are zero-shot reasoners", 447 "authors": ["Takeshi Kojima", "Shixiang Shane Gu", "Machel Reid", "Yutaka Matsuo", "Yusuke Iwasawa"], 448 "year": 2022, 449 "relevance": "Zero-shot CoT prompting demonstrating LLM reasoning capabilities without few-shot examples." 450 }, 451 { 452 "title": "Chain-of-verification reduces hallucination in large language models", 453 "authors": ["Shehzaad Dhuliawala", "Mojtaba Komeili", "Jing Xu", "Roberta Raileanu", "Xian Li", "Asli Celikyilmaz", "Jason Weston"], 454 "year": 2023, 455 "arxiv_id": "2309.11495", 456 "relevance": "Related verification approach for reducing LLM hallucinations through structured verification chains." 
457 }, 458 { 459 "title": "Least-to-most prompting enables complex reasoning in large language models", 460 "authors": ["Denny Zhou", "Nathanael Schärli", "Le Hou", "Jason Wei", "Nathan Scales", "Xuezhi Wang", "Dale Schuurmans", "Claire Cui", "Olivier Bousquet", "Quoc Le", "Ed Chi"], 461 "year": 2023, 462 "relevance": "Decomposition-based prompting strategy for complex reasoning; mentioned as complementary to self-verification." 463 }, 464 { 465 "title": "Language models are few-shot learners", 466 "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"], 467 "year": 2020, 468 "relevance": "GPT-3 paper establishing few-shot in-context learning capabilities that this work builds upon." 469 }, 470 { 471 "title": "Evaluating large language models trained on code", 472 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 473 "year": 2021, 474 "arxiv_id": "2107.03374", 475 "relevance": "Codex paper; the code-davinci models used in this paper's experiments are based on this work." 476 } 477 ] 478 }