scan.json (19014B)
1 { 2 "paper": { 3 "title": "CoT-based Synthesizer: Enhancing LLM Performance through Answer Synthesis", 4 "authors": ["Bohan Zhang", "Xiaokang Zhang", "Jing Zhang", "Jifan Yu", "Sijia Luo", "Jie Tang"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2501.01668" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "GitHub repository URL provided: https://github.com/RUCKBReasoning/CoT-based-Synthesizer (Section 1, footnote 1)." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses public benchmarks (GSM8k, MATH500, WikiTQ, FeTaQA). The abstract states 'training data and code are publicly available on the repository.' Appendix A mentions release under CC BY-SA 4.0 upon acceptance." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": true, 24 "justification": "Appendix C.1 specifies Ubuntu 22.04, PyTorch 2.4.0, 8 NVIDIA A800 80GB GPUs, Intel Xeon Platinum 8358, 2048GB RAM, and mentions Transformers and vLLM." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub repo is referenced but the paper itself does not include a 'Reproducing Results' section or specific commands." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Despite averaging over 3 runs, no confidence intervals or error bars are reported. All tables show point estimates only." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims their method 'significantly improves' and 'outperforms' baselines but provides no statistical significance tests (no p-values, t-tests, etc.)." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Percentage improvements over baselines are reported with context, e.g., '11.8% for Llama3-8B and 10.3% for GPT-4o on MATH500' and detailed improvement columns in Tables 2 and 4." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification for why 3 runs were chosen, no power analysis. The choice of benchmarks and number of policy models is not justified." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "Each experiment is 'conducted three times' and 'reported results are the average' (Section 5.1), but no standard deviation, variance, or spread measures are reported." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Multiple baselines included: CoT-prompting, Self-consistency, USC, ArmoRM, Scalar RM, and LMCOR (Section 5.1)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include recent methods: ArmoRM (2024), USC (2023), LMCOR (2023), using recent models like GPT-4o and Qwen2.5." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Section 5.3 ablates CoT training and the data generation pipeline. Appendix B ablates LLM Repair and Response LLM Sampling (Tables 2 and 4)." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Exact Match (EM) accuracy for GSM8k, MATH500, WikiTQ, and Rouge-L for FeTaQA (Section 5.1)." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation of synthesized outputs. All evaluation is automated via EM or Rouge-L." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "Training is on MATH and WikiTQ training sets; evaluation is on separate test sets. 
GSM8k and FeTaQA are unseen during training, serving as transfer tests." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Table 1 provides per-benchmark, per-model breakdowns. Table 3 provides per-correct-count breakdowns." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table 3 analyzes performance by number of correct candidates. Figures 5 and 6 show qualitative examples. The Limitations section discusses failure modes with grouping." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Table 2 shows ablation variants that hurt performance (w/o training degrades on GSM8k by 1.8 avg; w/o CoT training degrades on some models). Some models show negative deltas." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims of 11.8% for Llama3-8B and 10.3% for GPT-4o on MATH500 are supported by Table 1 (24.2→36.0 and 62.5→72.8). Claims of outperforming SC and BoN are supported." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "Causal claims about component contributions are supported by controlled ablation studies (Tables 2 and 4) that isolate individual components." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title and abstract claim general 'LLM Performance' enhancement, but results are limited to mathematical reasoning and table QA tasks. No explicit bounding of generalization scope." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No discussion of alternative explanations for the improvements. No threats-to-validity section addressing confounds." 
126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Table 6 lists specific model versions with HuggingFace URLs: GPT-4o-2024-0513, Llama3-8B-Instruct, Llama-3.1-70B-Instruct, Qwen2-7B-Instruct, Qwen2.5-14B-Instruct, etc." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Appendix E provides full prompt text for Synthesizer inference, LLM Repair, and CoT-prompting for each dataset (MATH500, GSM8k, TableQA)." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Table 5 reports temperature=0.9, Top-P=0.9, Max Tokens=1024. Appendix C.3 reports LR=2e-6, weight decay=1e-2, batch size=128, 2 epochs, BF16, max length 4096." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used. The method is a single-pass synthesis pipeline, not an agent with tools, retries, or feedback loops." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 4.2 describes the data pipeline: candidate generation, filtering via gold answer comparison (math) or CritiqueLLM scoring ≥8 (TableQA), LLM Repair stage. Dataset sizes documented in Table 7." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "A dedicated 'Limitations' section discusses grouping constraints due to input length and inference overhead." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "The Limitations section discusses only practical limitations (input length, inference overhead), not threats to validity of the experimental conclusions." 
165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "No explicit statement about what the results do NOT show. The paper does not bound its claims to the tested task types (math, table QA) or model families." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "Benchmarks used (GSM8k, MATH, WikiTQ, FeTaQA) are publicly available. Training data and code are stated to be released on GitHub." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 4.2 describes the full data generation pipeline including sampling parameters, filtering criteria, and repair process. Table 7 shows dataset sizes." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants. Data comes from standard benchmarks." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 4.2 and Appendix C.3 document the pipeline: 50 samplings per MATH sample, filtering by gold answer, LLM Repair with 20 additional samplings. 12k→295k MATH, 18k→87k WikiTQ." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "Acknowledgments section lists National Key Research & Develop Plan (2023YFF0725100) and NSFC grants." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations clearly listed: Renmin University of China, Tsinghua University. One author interned at Zhipu AI (footnote)." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": true, 208 "justification": "Funding is from Chinese government research programs (NSFC, National Key R&D Plan), which have no financial stake in the specific results." 
209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement. One author interned at Zhipu AI (which produces GLM-4-Plus, one of the evaluated models), but no conflict disclosure." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "No training cutoff dates stated for any models used (GPT-4o, GLM-4-Plus, Llama, Qwen)." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of whether the policy models may have seen the benchmark test data during pretraining." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "GSM8k (2021), MATH (2021), WikiTQ (2015), FeTaQA (2022) were all publicly available before the training cutoffs of the models used. No contamination discussion." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 
268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "The Limitations section acknowledges inference overhead but does not quantify it. No API costs, tokens consumed, or wall-clock time reported." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "Hardware is specified (8 A800 GPUs) but no total GPU hours, training time, or API costs are reported." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "CoT-based Synthesizer achieves 11.8% improvement for Llama3-8B and 10.3% for GPT-4o on MATH500", 286 "evidence": "Table 1: Llama3-8B goes from 24.2 (CoT-prompting) to 36.0 (Synthesizer-8B); GPT-4o from 62.5 to 72.8.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "The method can synthesize correct answers even when all candidate responses are incorrect", 291 "evidence": "Table 3 shows Synthesizer-8B produces 9 correct answers when correct count=0, while SC/ArmoRM/Scalar RM produce 0. 
Figure 5 provides a qualitative example.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "Synthesizer-8B generalizes to unseen datasets (GSM8k, FeTaQA not in training data)", 296 "evidence": "Table 1 shows improvements on GSM8k (+3.4 avg) and FeTaQA (+3.2 avg) despite not being in training data.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "The method consistently outperforms baselines across all benchmarks", 301 "evidence": "Table 1 shows the method achieves highest or second-highest average across all 4 benchmarks, but does not always win per individual model.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "Performance improves in a log-linear relationship with training data size", 306 "evidence": "Figure 3 shows the trend on MATH and GSM8k, but only visual evidence from a figure with no formal fit statistics.", 307 "supported": "weak" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "CoT-based Synthesizer is a novel inference scaling strategy that synthesizes answers from multiple LLM candidate responses using chain-of-thought reasoning, rather than selecting from candidates. A small 8B-parameter synthesizer trained on automatically generated data improves performance of larger models including GPT-4o across math reasoning and table QA benchmarks. The method uniquely produces correct answers even when all candidates are wrong (9/500 cases on MATH500 with Llama3-8B). Performance scales log-linearly with training data and consistently with number of inference candidates, unlike reward-model-based methods that degrade at high candidate counts.", 312 "red_flags": [ 313 { 314 "flag": "No variance despite multiple runs", 315 "detail": "Each experiment is averaged over 3 runs but no standard deviation or confidence intervals are reported, making it impossible to assess whether differences between methods are statistically meaningful." 
316 }, 317 { 318 "flag": "No contamination analysis", 319 "detail": "All benchmarks (GSM8k 2021, MATH 2021, WikiTQ 2015, FeTaQA 2022) predate the models used. No discussion of whether models saw test data during pretraining." 320 }, 321 { 322 "flag": "Undisclosed conflict of interest", 323 "detail": "One author interned at Zhipu AI, which produces GLM-4-Plus (one of the evaluated models). No competing interests statement is provided." 324 }, 325 { 326 "flag": "Overbroad generalization claims", 327 "detail": "Title and abstract claim general 'LLM Performance' enhancement but results are limited to math reasoning and table QA. No testing on code generation, open-ended QA, summarization, or other tasks." 328 } 329 ], 330 "cited_papers": [ 331 { 332 "title": "Chain-of-thought prompting elicits reasoning in large language models", 333 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 334 "year": 2022, 335 "relevance": "Foundational work on chain-of-thought prompting that this paper builds upon for its synthesis strategy." 336 }, 337 { 338 "title": "Self-consistency improves chain of thought reasoning in language models", 339 "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"], 340 "year": 2022, 341 "arxiv_id": "2203.11171", 342 "relevance": "Key baseline method for inference scaling via majority voting on multiple LLM outputs." 343 }, 344 { 345 "title": "Universal self-consistency for large language model generation", 346 "authors": ["Xinyun Chen"], 347 "year": 2023, 348 "arxiv_id": "2311.17311", 349 "relevance": "Extends self-consistency to open-ended tasks using LLM-based voting; direct baseline in this paper." 350 }, 351 { 352 "title": "Small language models improve giants by rewriting their outputs", 353 "authors": ["Giorgos Vernikos"], 354 "year": 2023, 355 "arxiv_id": "2305.13514", 356 "relevance": "LMCOR synthesis baseline; demonstrates small models can improve larger model outputs." 
357 }, 358 { 359 "title": "LLM-blender: Ensembling large language models with pairwise ranking and generative fusion", 360 "authors": ["Dongfu Jiang", "Xiang Ren", "Bill Yuchen Lin"], 361 "year": 2023, 362 "arxiv_id": "2306.02561", 363 "relevance": "Related ensemble method for combining LLM outputs via ranking and fusion." 364 }, 365 { 366 "title": "Training verifiers to solve math word problems", 367 "authors": ["Karl Cobbe"], 368 "year": 2021, 369 "arxiv_id": "2110.14168", 370 "relevance": "Introduces GSM8k benchmark and outcome-based verification for math reasoning; baseline method in this paper." 371 }, 372 { 373 "title": "An empirical analysis of compute-optimal inference for problem-solving with language models", 374 "authors": ["Yangzhen Wu"], 375 "year": 2024, 376 "arxiv_id": "2408.00724", 377 "relevance": "Studies compute-optimal inference scaling, directly relevant to the paper's inference scaling approach." 378 }, 379 { 380 "title": "Large language monkeys: Scaling inference compute with repeated sampling", 381 "authors": ["Bradley Brown"], 382 "year": 2024, 383 "arxiv_id": "2407.21787", 384 "relevance": "Studies scaling inference compute via repeated sampling, the paradigm this paper extends." 385 }, 386 { 387 "title": "Interpretable preferences via multi-objective reward modeling and mixture-of-experts", 388 "authors": ["Haoxiang Wang"], 389 "year": 2024, 390 "arxiv_id": "2406.12845", 391 "relevance": "ArmoRM reward model used as a Best-of-N baseline in this paper." 392 } 393 ] 394 }