scan-v5.json (26340B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Large Language Models are Better Reasoners with Self-Verification", 6 "authors": [ 7 "Yixuan Weng", 8 "Minjun Zhu", 9 "Fei Xia", 10 "Bin Li", 11 "Shizhu He" 12 ], 13 "year": 2022, 14 "venue": "Conference on Empirical Methods in Natural Language Processing", 15 "arxiv_id": "2212.09561", 16 "doi": "10.18653/v1/2023.findings-emnlp.167" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "The abstract claims improvements on arithmetic, commonsense, and logical reasoning datasets, with specific figures (60.8→65.1 on GSM8K, 91.01→93.40 on SingleEq) that match Table 1 results exactly.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "The paper makes causal claims that self-verification improves reasoning; the study design is adequate — it isolates the verification step as the single variable against a CoT baseline using the same model and prompts.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "The title and conclusions claim broadly that 'LLMs are Better Reasoners with Self-Verification,' but experiments are confined to two GPT-3 family models (code-davinci-001 and code-davinci-002) from OpenAI; generalization to other LLM families is not demonstrated.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper does not discuss alternative explanations for why self-verification works, such as whether the benefit is purely from additional compute/sampling (vs. the backward reasoning structure itself); the comparison to self-consistency partially addresses this but without explicit framing.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper measures 'problem solve rate' (exact answer accuracy) directly against ground truth, with no proxy — claims of improved reasoning are bounded to this direct accuracy measure.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "A dedicated Limitations section is present after the Conclusion, discussing prompt bias, dependence on candidate quality, model size requirements, scope of evaluation, and computational cost.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": true, 61 "justification": "The limitations are specific: prompts may introduce bias, the method requires a correct answer to already exist among candidates, smaller models (0.4B, 1.3B) see negative or negligible benefit, and the method does not evaluate the reasoning process itself.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": true, 67 "justification": "The paper explicitly states the method is 'more suitable for arithmetic reasoning tasks than other reasoning tasks' and that 'it is challenging to augment the reasoning performance of smaller language models,' providing meaningful scope boundaries.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": true, 75 "justification": "Acknowledgments disclose National Key R&D Program of China (No.2022ZD0118501), NSFC grants, Strategic Priority Research Program of CAS, Youth Innovation Promotion Association CAS, and OPPO Research Fund.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations are listed: CAS Laboratory of Cognition and Decision Intelligence, UCAS, Hunan University, and Unisound Beijing.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": true, 87 "justification": "Funders are Chinese government agencies and OPPO (smartphone company); neither has a direct financial stake in whether GPT-3/InstructGPT reasoning benchmarks improve, making them sufficiently independent of the outcome.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests statement is present; the paper only lists funding sources in acknowledgments without declaring patents, equity, or consulting relationships.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Key terms are defined: 'self-verification' is defined operationally through forward reasoning and backward verification steps; 'Condition Mask Verification' and 'True-False Item Verification' are both formally described with examples.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Three explicit contributions are enumerated: (1) proving LLMs can self-verify without training additional verifiers, (2) multi-dataset empirical results, (3) two verification methods (CMV and TFV).", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "The related work section engages substantively with CoT prompting, self-consistency, in-context learning, and trained verifier approaches (Cobbe et al., Shen et al., Li et al.), explaining how this work eliminates the need for annotations and extra models.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": true, 124 "justification": "Code is publicly released at https://github.com/WENGSYX/Self-Verification, stated in the abstract.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "All 8 benchmark datasets are standard publicly available benchmarks with explicit URLs provided in Appendix A.2 (GSM8K, SingleEq, AddSub, MultiArith, AQUA-RAT, SVAMP, CSQA, BIG-bench Date Understanding).", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "No requirements.txt, Dockerfile, or formal dependency specification is provided; the paper mentions using NumPy and the OpenAI API but gives no reproducible environment specification.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": true, 142 "justification": "Appendix provides complete prompts for all datasets (Tables 8–17), hyperparameters (K=5, P=10, max_tokens=168), answer cleansing pseudo-code (Table 3), and API run dates (Appendix A.3), enabling reconstruction of experiments.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "No confidence intervals or error bars are reported anywhere in the tables; results are averages of 3 runs but spread is not shown.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are applied to any comparative claim; improvements as small as +0.07 (SingleEq with SC) are presented without testing.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Percentage-point improvements are reported inline (e.g., +4.33 on GSM8K, +2.39 on SingleEq) with baseline context throughout Table 1.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "No power analysis or justification of dataset sample sizes is provided; dataset statistics are listed in Table 6 but without discussion of adequacy.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "The paper states experiments were run three times and averages reported, but standard deviations or ranges across runs are never reported.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "CoT baseline is included for all datasets; self-consistency (SC) and PAL are also used as baselines for combining experiments.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "Baselines include CoT (Wei et al., 2022), self-consistency (Wang et al., 2023), and PAL (Gao et al., 2023), all contemporary methods from the same era.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "Figure 5 ablates single vs. multiple condition masks; Figure 6 compares CMV vs. TFV methods; Figure 7 ablates the number of verification samples P.", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": false, 200 "justification": "Only 'problem solve rate' (exact accuracy) is reported; no additional metrics such as efficiency, calibration, or partial-credit scoring are included.", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": false, 205 "answer": false, 206 "justification": "The paper evaluates on standard mathematical and reasoning benchmarks with deterministic ground-truth answers; human evaluation is not relevant.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "Standard benchmark test sets are used; all 8 datasets have held-out test sets with ground-truth labels not used in constructing prompts.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Results are broken down per dataset and grouped by task category (arithmetic, commonsense, logical reasoning) throughout Table 1 and Figures 3–6.", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Table 7 in the appendix provides explicit examples of verification failures (marked [✗] with ground truth) across all 8 datasets, showing where self-verification produces wrong answers.", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "Figure 3b explicitly shows self-verification hurts smaller models (0.4B) on some tasks (pink area indicates negative impact), and the paper discusses this limitation.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 238 "justification": "Exact model identifiers are specified: 'code-davinci-001' (GPT-3 175B) and 'code-davinci-002' (InstructGPT 175B), and Section 4.2 references 'text-ada-001', 'text-babbage-001', 'text-curie-001' for size comparisons.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": true, 244 "justification": "Complete few-shot prompts for all 8 datasets for both forward reasoning (Tables 8–11) and backward verification (Tables 12–17) are provided in the appendix.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Section 4.4 specifies K=5 candidate answers, P=10 verification iterations, max token length of 168, and sampling decoding without top-k truncation.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": true, 255 "answer": true, 256 "justification": "The full two-step pipeline (forward reasoning → backward verification → score-based selection) is described in detail in Section 3, including the rewriting step and score calculation formula.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "Table 3 provides explicit answer cleansing strategies with pseudo-code for all answer formats (number, multiple choice, true/false, yes/no, free format).", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": false, 270 "justification": "Raw model outputs (API responses for each example) are not released; only aggregated accuracy numbers are reported in tables.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "All datasets are standard benchmarks with public URLs and statistics (Table 6); no novel data collection was performed, so the provenance is well-documented via citations.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "Only standard benchmarks are used; no human participant recruitment occurred.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "The full pipeline from input to final answer is documented: prompt construction, sampling decoding, backward verification, score calculation, and answer cleansing are all described with formulas and pseudo-code.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "The paper does not state the training data cutoffs for code-davinci-001 or code-davinci-002; it only notes API runs were conducted November–December 2022.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": false, 302 "justification": "The paper does not discuss potential overlap between GPT-3's training data and the benchmark test sets (GSM8K, CSQA, etc.), which were publicly available before training cutoffs.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": false, 308 "justification": "Benchmark contamination is not addressed; all 8 benchmarks (including GSM8K released October 2021) were publicly available before GPT-3's likely training cutoff.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants; NA.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants; NA.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants; NA.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants; NA.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants; NA.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants; NA.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants; NA.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": false, 360 "justification": "Figure 7 shows performance vs. number of verification samples P qualitatively, but no absolute cost, latency, or token counts are reported.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "No total computational budget (API calls, cost, GPU hours) is stated; only the API run date range is mentioned in the reproducibility statement.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "LLMs can self-verify their reasoning outputs without additional training or annotation, using only few-shot prompts", 375 "evidence": "CoT+Self-Verification outperforms plain CoT on all 8 datasets for both code-davinci-001 and code-davinci-002 (Table 1)", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Self-verification improves arithmetic reasoning more than commonsense or logical reasoning", 380 "evidence": "Average improvement for arithmetic tasks is 1.67%/2.84% vs. 0.62%/0.78% for other tasks (Section 5, Table 1)", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Self-verification is an emergent property of larger models; smaller models (<7B) see little or negative benefit", 385 "evidence": "Figure 3b shows negative impact for 0.4B model on GSM8K and MultiArith; benefit increases monotonically with model size", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Condition Mask Verification (CMV) outperforms True-False Item Verification (TFV) on arithmetic tasks", 390 "evidence": "Figure 6 shows CMV consistently above TFV for InstructGPT and GPT-3 on 6 arithmetic datasets", 391 "supported": "strong" 392 }, 393 { 394 "claim": "Self-verification provides additional gains on top of self-consistency and PAL baselines", 395 "evidence": "Table 1 shows SC+SV and PAL+SV outperform SC and PAL alone across datasets, though improvements are often <1pp for SC combinations", 396 "supported": "moderate" 397 }, 398 { 399 "claim": "Self-verification achieves new SOTA on 6 of 8 benchmark datasets", 400 "evidence": "Table 1 compares against fine-tuned SOTA baselines; the claim is technically supported but conflates few-shot vs. fine-tuned paradigms", 401 "supported": "weak" 402 } 403 ], 404 "methodology_tags": [ 405 "benchmark-eval" 406 ], 407 "key_findings": "The paper proposes a two-stage self-verification method where LLMs generate multiple candidate answers via Chain-of-Thought, then verify them by masking original conditions and testing whether the model can recover them from the candidate conclusion. The method achieves consistent accuracy improvements on all 8 tested reasoning benchmarks without any additional training or annotation. Gains are larger for arithmetic tasks (CMV method) than for commonsense/logical tasks (TFV method), and the benefit scales with model size, appearing to be an emergent capability. Combining self-verification with self-consistency or PAL yields further, smaller improvements.", 408 "red_flags": [ 409 { 410 "flag": "No variance reporting", 411 "detail": "Three runs are averaged but no standard deviations are reported, making it impossible to assess whether small improvements (e.g., +0.07pp, +0.10pp) are meaningful." 412 }, 413 { 414 "flag": "Deprecated models, limited reproducibility", 415 "detail": "code-davinci-001 and code-davinci-002 are no longer accessible via the OpenAI API, making exact reproduction impossible despite detailed prompts." 416 }, 417 { 418 "flag": "Benchmark contamination unaddressed", 419 "detail": "All 8 benchmarks were publicly available before GPT-3's training cutoff; the paper does not discuss potential test-set memorization as a confound." 420 }, 421 { 422 "flag": "SOTA comparison conflates paradigms", 423 "detail": "The paper compares few-shot self-verification against fine-tuned SOTA baselines (e.g., GSM8K fine-tuned + verifier), presenting this as a fair SOTA comparison without acknowledging the paradigm difference." 424 }, 425 { 426 "flag": "Marginal improvements without significance tests", 427 "detail": "Many SC+SV improvements are below 0.5pp (e.g., +0.06 on AddSub, +0.10 on CSQA) but are presented uniformly as improvements without statistical testing." 428 } 429 ], 430 "cited_papers": [ 431 { 432 "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models", 433 "relevance": "Foundation for the forward reasoning step; self-verification is presented as a complement to CoT" 434 }, 435 { 436 "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models", 437 "relevance": "Primary baseline; self-verification is compared against and combined with self-consistency" 438 }, 439 { 440 "title": "PAL: Program-Aided Language Models", 441 "relevance": "Secondary baseline for forward reasoning; combined with self-verification in experiments" 442 }, 443 { 444 "title": "Training Verifiers to Solve Math Word Problems", 445 "relevance": "Key prior work on answer verification requiring additional training; self-verification is explicitly positioned against this approach" 446 }, 447 { 448 "title": "Let's Verify Step by Step", 449 "relevance": "Related work on process-level verification; cited as requiring fine-tuning, which self-verification avoids" 450 }, 451 { 452 "title": "Large Language Models are Zero-Shot Reasoners", 453 "relevance": "Related prompting work (zero-shot CoT) situating the few-shot prompting context of this paper" 454 }, 455 { 456 "title": "On the Advance of Making Language Models Better Reasoners", 457 "relevance": "Closely related work on improving LLM reasoning; cited as training-based comparison method" 458 } 459 ], 460 "engagement_factors": { 461 "practical_relevance": { 462 "score": 2, 463 "justification": "The no-training, few-shot approach is immediately applicable to practitioners using any capable LLM API, though the specific models tested are now deprecated." 464 }, 465 "surprise_contrarian": { 466 "score": 2, 467 "justification": "The finding that a model can reliably verify its own outputs — without a separate verifier model — challenges the intuition that self-evaluation is circular or unreliable." 468 }, 469 "fear_safety": { 470 "score": 0, 471 "justification": "The paper raises no safety or risk concerns; it is focused on benchmark accuracy improvement." 472 }, 473 "drama_conflict": { 474 "score": 0, 475 "justification": "No controversy or conflict with other labs or claims; straightforward benchmark improvement paper." 476 }, 477 "demo_ability": { 478 "score": 2, 479 "justification": "The method can be tried with current GPT-4 or GPT-3.5-turbo using the published prompts, though exact replication requires deprecated models." 480 }, 481 "brand_recognition": { 482 "score": 1, 483 "justification": "Chinese Academy of Sciences is a recognized institution; tests GPT-3 (well-known), but no major Western lab branding." 484 } 485 }, 486 "hn_data": { 487 "threads": [ 488 { 489 "hn_id": "33996055", 490 "title": "Generic Tagging for RISC-V Binaries", 491 "points": 3, 492 "comments": 0, 493 "url": "https://news.ycombinator.com/item?id=33996055", 494 "created_at": "2022-12-15T06:35:27Z" 495 }, 496 { 497 "hn_id": "43321245", 498 "title": "Introduction to Online Control", 499 "points": 2, 500 "comments": 0, 501 "url": "https://news.ycombinator.com/item?id=43321245", 502 "created_at": "2025-03-10T14:46:05Z" 503 }, 504 { 505 "hn_id": "33699117", 506 "title": "Where Did My Variable Go? Poking Holes in Incomplete Debug Information", 507 "points": 2, 508 "comments": 0, 509 "url": "https://news.ycombinator.com/item?id=33699117", 510 "created_at": "2022-11-21T22:37:23Z" 511 }, 512 { 513 "hn_id": "33355436", 514 "title": "Challenging Big-Bench Tasks and Whether Chain-of-Thought Can Solve Them", 515 "points": 1, 516 "comments": 1, 517 "url": "https://news.ycombinator.com/item?id=33355436", 518 "created_at": "2022-10-27T10:01:21Z" 519 }, 520 { 521 "hn_id": "43324715", 522 "title": "Breakthrough on Kakeya Problem: Sticky Kakeya Conjecture Proven in 3 Dimensions (2022)", 523 "points": 1, 524 "comments": 0, 525 "url": "https://news.ycombinator.com/item?id=43324715", 526 "created_at": "2025-03-10T19:17:15Z" 527 }, 528 { 529 "hn_id": "35151086", 530 "title": "Token Merging: Your ViT but Faster", 531 "points": 1, 532 "comments": 0, 533 "url": "https://news.ycombinator.com/item?id=35151086", 534 "created_at": "2023-03-14T13:25:09Z" 535 }, 536 { 537 "hn_id": "29757897", 538 "title": "A new PPE items Dataset", 539 "points": 1, 540 "comments": 0, 541 "url": "https://news.ycombinator.com/item?id=29757897", 542 "created_at": "2022-01-01T12:52:35Z" 543 } 544 ], 545 "top_points": 3, 546 "total_points": 11, 547 "total_comments": 1 548 } 549 }