scan.json (19527B)
1 { 2 "paper": { 3 "title": "Dancing with Critiques: Enhancing LLM Reasoning with Stepwise Natural Language Self-Critique", 4 "authors": ["Yansi Li", "Jiahao Xu", "Tian Liang", "Xingyu Chen", "Zhiwei He", "Qiuzhi Liu", "Rui Wang", "Zhuosheng Zhang", "Zhaopeng Tu", "Haitao Mi", "Dong Yu"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2503.17363" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "GitHub link provided in abstract: https://github.com/puddingyeah/PANEL" 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "Uses publicly available benchmarks: AIME (from Art of Problem Solving wiki) and GPQA Diamond (Rein et al., 2024). No proprietary data collected." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section found in the paper. Only model names are mentioned." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions in the paper. Code repository is referenced but no README or reproduction guide is included in the paper itself." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "All results in Tables 1-2 and Figure 3 report only point estimates with no confidence intervals, error bars, or ± notation." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims PANEL 'outperforms' and 'surpasses' baselines based solely on comparing raw accuracy numbers without any statistical significance tests." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Percentage improvements are reported with baseline context, e.g., 'NL self-critique improves reasoning accuracy over solution-level self-evaluation by 2.2%' (Section 3.2), and absolute numbers are provided in tables allowing comparison." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "AIME uses only 45 problems (30 + 15) and GPQA Diamond has 198 questions. No justification for these sample sizes or discussion of statistical power given the small N." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No standard deviation, variance, or results across multiple runs reported. All results appear to be single-run numbers." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Multiple baselines included: Self-Consistency, Solution-Level Self-Evaluation, Solution-Level Self-Evaluation with NL Self-Critique, and Step-Level Self-Evaluation (Section 3.1)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include recent methods: Self-Consistency (Wang et al., 2023b), Step-Level Self-Evaluation (Xie et al., 2024). These are contemporary and relevant." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Ablation provided: PANEL with and without NL Self-Critique in Figure 3, and comparison of self-critique vs. external critique model in Table 2." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": false, 78 "justification": "Only accuracy is reported as the evaluation metric. No other metrics (e.g., computational efficiency comparisons, token usage) are used for evaluation." 
79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation of the generated reasoning traces or critiques. All evaluation is automated via answer matching. Given the paper's claims about 'rich, human-readable critiques,' human evaluation of critique quality would be relevant." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "AIME and GPQA Diamond are standard test sets. No tuning on these benchmarks is described; the method is inference-time only." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Table 1 provides per-domain breakdowns for GPQA (Biology, Chemistry, Physics) and per-year breakdowns for AIME (2024, 2025)." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": false, 98 "justification": "Only success cases are shown (Figures 2 and 9). No analysis of where PANEL fails or produces worse results than baselines." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Table 2 shows that external 70B critique underperforms self-critique (8B) for step-level evaluation on GPQA (32.3% vs 38.9%), which is a somewhat counterintuitive negative result." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims that PANEL 'significantly enhances reasoning performance, outperforming traditional scalar reward-based methods' are supported by the Table 1 results showing improvements on both benchmarks." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "Causal claims ('NL self-critique improves reasoning accuracy') are supported by controlled ablation: PANEL with vs. without self-critique (Figure 3), which is a single-variable manipulation." 
116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title and abstract claim to enhance 'LLM Reasoning' broadly, but results are only on two Llama models and two STEM reasoning benchmarks. No testing on non-STEM reasoning, other model families, or other task types." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No discussion of alternative explanations for the improvements. For example, the additional inference compute from generating critiques is not controlled for — the improvement could be from additional compute rather than critique quality." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Specific model versions stated: 'Llama 3.1-8B-Instruct' and 'Llama 3.3-70B-Instruct' (Section 3.1). These are specific enough to identify the exact models." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Full prompt text for NL self-critique provided in Appendix A (Figures 5-8) for Math, Physics, Chemistry, and Biology domains." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "Temperature is stated as 0.6 (footnote 1) and K=5 candidates. However, other hyperparameters like top-p, max tokens, and search depth/budget are not reported." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The three-stage PANEL framework (Sampling Candidates, NL Self-Critique, Decision Making) is described in detail in Section 2, including the search algorithm and decision-making process." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": false, 152 "justification": "No description of how AIME problems were formatted for input, how GPQA Diamond questions were processed, or any preprocessing steps." 
153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "A dedicated 'Limitations' section is present after Section 5 (Conclusion), discussing theoretical understanding gaps and challenges in quantifying information richness." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "The limitations section discusses only general theoretical gaps ('a deeper theoretical understanding... is still needed') and methodological challenges of comparing information richness. No specific threats like small sample sizes on AIME or compute fairness are mentioned." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "No explicit statements about what the results do NOT show. The paper does not bound its claims to STEM reasoning with Llama models, nor state what was not tested." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "No raw experimental outputs, model responses, or per-problem results are made available. Only aggregate accuracy numbers are reported." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Benchmarks are described: AIME from American Invitational Mathematics Examination (2024 and 2025-Part1), GPQA Diamond from Rein et al. (2024) with 198 questions across three domains." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants. Data sources are standard benchmarks." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": false, 191 "justification": "No documentation of how benchmark problems are fed into the system, how outputs are parsed and evaluated, or intermediate processing steps." 
192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding or acknowledgments section found in the paper." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations clearly listed: Tencent and Shanghai Jiao Tong University. Internship relationships also disclosed." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding disclosed, so independence cannot be assessed. Authors are from Tencent, which has commercial interest in LLM reasoning capabilities, but this potential conflict is not discussed." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement found in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper uses Llama 3.1 and 3.3 models on AIME and GPQA benchmarks but does not state the training data cutoff dates for these models." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of whether AIME 2024 or GPQA Diamond problems could have been in the Llama training data. AIME 2025 Part 1 is likely post-training-cutoff, but this is not discussed." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "GPQA Diamond was published in 2024 and could be in Llama 3.3 training data. AIME problems are publicly available. No contamination analysis is provided." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 
238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "PANEL generates critiques for each candidate at each step, significantly increasing inference cost. No cost, latency, or token consumption numbers are reported." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No information on total compute used, GPU hours, or hardware specifications." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "PANEL outperforms traditional scalar reward-based methods on AIME and GPQA Diamond benchmarks.", 286 "evidence": "Table 1 shows PANEL achieving 24.4% on AIME (vs 17.8-22.2% baselines) and 54.5% on GPQA (vs 51.0-52.5% baselines) with Llama3.3-70B. Section 3.2.", 287 "supported": "moderate" 288 }, 289 { 290 "claim": "NL self-critique improves reasoning accuracy in both solution-level and step-level self-evaluation methods.", 291 "evidence": "Section 3.2: NL self-critique improves solution-level self-evaluation by 2.2% (8B) and 0.5% (70B) on GPQA. 
Step-level improvement of 6.6% on AIME and 3.0% on GPQA with the 70B model.", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "Self-generated NL critique from the policy model is more effective than external larger model critique for step-level evaluation.", 296 "evidence": "Table 2: 8B self-critique achieves 38.9% vs 32.3% for external 70B critique on GPQA in step-level setting.", 297 "supported": "weak" 298 }, 299 { 300 "claim": "PANEL scales effectively with increased computational resources (pass@k analysis).", 301 "evidence": "Figure 3 shows PANEL consistently outperforming baseline across k=1,2,4,8 for both model sizes.", 302 "supported": "moderate" 303 } 304 ], 305 "methodology_tags": ["benchmark-eval"], 306 "key_findings": "PANEL introduces natural language self-critique as a replacement for scalar process reward models in step-level tree search for LLM reasoning. On AIME and GPQA Diamond benchmarks with Llama 3.1-8B and 3.3-70B, PANEL outperforms self-consistency and self-evaluation baselines. Self-generated critiques from the policy model itself outperform critiques from a larger external model for step-level evaluation, and NL self-critique primarily influences early reasoning steps.", 307 "red_flags": [ 308 { 309 "flag": "No uncertainty quantification", 310 "detail": "All results are single-run point estimates on small benchmarks (45 AIME problems, 198 GPQA questions) with no error bars, confidence intervals, or multiple-run statistics. Performance differences could be within noise." 311 }, 312 { 313 "flag": "Compute fairness not addressed", 314 "detail": "PANEL generates additional critique text and decision-making text for each candidate at each step, substantially increasing inference compute. Baselines are not given equivalent additional compute, making the comparison potentially unfair." 315 }, 316 { 317 "flag": "Very small benchmark for key claims", 318 "detail": "AIME has only 45 problems total (30+15). A single problem is worth ~2.2% accuracy. 
Performance differences of 2-6% on AIME could represent 1-3 problems, making claims of 'significant improvement' on AIME unreliable." 319 }, 320 { 321 "flag": "No contamination analysis", 322 "detail": "AIME problems and GPQA Diamond may be in Llama training data. No discussion of potential contamination despite using publicly available benchmarks with models that may have been trained on them." 323 } 324 ], 325 "cited_papers": [ 326 { 327 "title": "Scaling test-time compute optimally can be more effective than scaling LLM parameters", 328 "authors": ["Charlie Victor Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"], 329 "year": 2025, 330 "relevance": "Core related work on inference-time compute scaling, directly relevant to survey scope on LLM reasoning capabilities." 331 }, 332 { 333 "title": "Self-consistency improves chain of thought reasoning in language models", 334 "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"], 335 "year": 2023, 336 "relevance": "Key baseline method for LLM reasoning via sampling multiple reasoning paths." 337 }, 338 { 339 "title": "Self-evaluation guided beam search for reasoning", 340 "authors": ["Yuxi Xie", "Kenji Kawaguchi", "Yiran Zhao"], 341 "year": 2024, 342 "relevance": "Step-level self-evaluation baseline directly compared against in experiments." 343 }, 344 { 345 "title": "Tree of thoughts: Deliberate problem solving with large language models", 346 "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"], 347 "year": 2024, 348 "relevance": "Foundational work on tree-structured reasoning for LLMs." 349 }, 350 { 351 "title": "Let's verify step by step", 352 "authors": ["Hunter Lightman", "Vineet Kosaraju", "Yuri Burda"], 353 "year": 2024, 354 "relevance": "Process reward models for step-level verification, the approach PANEL aims to replace." 
355 }, 356 { 357 "title": "Reflexion: Language agents with verbal reinforcement learning", 358 "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath"], 359 "year": 2024, 360 "relevance": "Related work on using verbal/natural language feedback for LLM self-improvement." 361 }, 362 { 363 "title": "CRITIC: Large language models can self-correct with tool-interactive critiquing", 364 "authors": ["Zhibin Gou", "Zhihong Shao", "Yeyun Gong"], 365 "year": 2024, 366 "relevance": "Related work on LLM self-critique and self-correction capabilities." 367 }, 368 { 369 "title": "Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning", 370 "authors": ["Daya Guo"], 371 "year": 2025, 372 "arxiv_id": "2501.12948", 373 "relevance": "Contemporary work on enhancing LLM reasoning capabilities via RL." 374 }, 375 { 376 "title": "Chain-of-thought prompting elicits reasoning in large language models", 377 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 378 "year": 2022, 379 "relevance": "Foundational work on chain-of-thought reasoning in LLMs." 380 }, 381 { 382 "title": "Generative verifiers: Reward modeling as next-token prediction", 383 "authors": ["Lunjun Zhang", "Arian Hosseini", "Hritik Bansal"], 384 "year": 2024, 385 "relevance": "Related approach using generative models as verifiers, relevant to LLM evaluation methodology." 386 } 387 ] 388 }