scan.json (22388B)
1 { 2 "scan_version": 2, 3 "active_modules": ["experimental_rigor", "data_leakage"], 4 "paper": { 5 "title": "Improving LLM Reasoning with Multi-Agent Tree-of-Thought Validator Agent", 6 "authors": ["Fatemeh Haji", "Mazal Bethany", "Maryam Tabar", "Cho-Yu Jason Chiang", "Anthony Rios", "Peyman Najafirad"], 7 "year": 2024, 8 "venue": "1st Workshop on Safe and Trustworthy Agents @NeurIPS 2024", 9 "arxiv_id": "2409.11527", 10 "doi": "10.48550/arXiv.2409.11527" 11 }, 12 "checklist": { 13 "artifacts": { 14 "code_released": { 15 "applies": true, 16 "answer": true, 17 "justification": "GitHub URL provided in abstract: https://github.com/SecureAIAutonomyLab/MA-ToT" 18 }, 19 "data_released": { 20 "applies": true, 21 "answer": true, 22 "justification": "GSM8K is a publicly available benchmark dataset. The paper uses a random subset of 500 samples from it." 23 }, 24 "environment_specified": { 25 "applies": true, 26 "answer": false, 27 "justification": "Hardware is mentioned (4x NVIDIA DGX A100 80GB) but no requirements.txt, Dockerfile, or library version details are provided." 28 }, 29 "reproduction_instructions": { 30 "applies": true, 31 "answer": false, 32 "justification": "No step-by-step reproduction instructions are provided in the paper. A GitHub link is given but the paper itself contains no README-level reproduction guidance." 33 } 34 }, 35 "statistical_methodology": { 36 "confidence_intervals_or_error_bars": { 37 "applies": true, 38 "answer": false, 39 "justification": "Table 1 reports only point estimates (e.g., 84.2%) with no confidence intervals or error bars." 40 }, 41 "significance_tests": { 42 "applies": true, 43 "answer": false, 44 "justification": "The paper claims 'outperforms' based on comparing raw accuracy numbers without any statistical significance tests." 45 }, 46 "effect_sizes_reported": { 47 "applies": true, 48 "answer": true, 49 "justification": "The paper reports percentage point improvements with baseline context, e.g., '8.8 percentage points over ToT for GPT-3.5-turbo (from 75.4% to 84.2%)'." 50 }, 51 "sample_size_justified": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper uses 500 samples from GSM8K with no justification for why 500 was chosen or whether this is sufficient for the claimed comparisons." 55 }, 56 "variance_reported": { 57 "applies": true, 58 "answer": false, 59 "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single runs." 60 } 61 }, 62 "evaluation_design": { 63 "baselines_included": { 64 "applies": true, 65 "answer": true, 66 "justification": "Three baselines are compared: Standard IO, Chain of Thought (CoT), and Tree of Thoughts (ToT)." 67 }, 68 "baselines_contemporary": { 69 "applies": true, 70 "answer": true, 71 "justification": "CoT (2022) and ToT (2023) are recent and standard baselines for LLM reasoning evaluation." 72 }, 73 "ablation_study": { 74 "applies": true, 75 "answer": false, 76 "justification": "No ablation study is presented. The system has multiple components (ToT Reasoners, Thought Validator, consensus voting, iterative refinement) but none are individually ablated." 77 }, 78 "multiple_metrics": { 79 "applies": true, 80 "answer": false, 81 "justification": "Only accuracy is reported as an evaluation metric." 82 }, 83 "human_evaluation": { 84 "applies": true, 85 "answer": false, 86 "justification": "No human evaluation is included. The paper makes claims about 'trustworthiness' of reasoning but only uses automated accuracy." 87 }, 88 "held_out_test_set": { 89 "applies": true, 90 "answer": true, 91 "justification": "They use a random subset of 500 samples from the GSM8K test set. No tuning on this set is described." 92 }, 93 "per_category_breakdown": { 94 "applies": true, 95 "answer": false, 96 "justification": "Only overall accuracy numbers per model are reported. No breakdown by problem difficulty, type, or category." 97 }, 98 "failure_cases_discussed": { 99 "applies": true, 100 "answer": true, 101 "justification": "Section 5 discusses failure modes: fixed tree depth causing unnecessary complexity for easy problems and insufficient depth for hard ones. The appendix shows detailed examples of incorrect reasoning." 102 }, 103 "negative_results_reported": { 104 "applies": true, 105 "answer": true, 106 "justification": "The paper notes that ToT and their method show diminishing returns for models that already perform well on standard IO (e.g., GPT-4o-mini and Llama-3.1-70B show smaller improvements)." 107 } 108 }, 109 "claims_and_evidence": { 110 "abstract_claims_supported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The abstract claims 'outperforming the standard ToT strategy by an average 5.6% across four LLMs' which is supported by Table 1 results." 114 }, 115 "causal_claims_justified": { 116 "applies": true, 117 "answer": false, 118 "justification": "The paper claims the Thought Validator 'enhances' and 'improves' reasoning but does not ablate the Validator component separately from the multi-agent setup to establish causality." 119 }, 120 "generalization_bounded": { 121 "applies": true, 122 "answer": false, 123 "justification": "Title says 'Improving LLM Reasoning' broadly but evaluation is only on GSM8K (arithmetic reasoning). The paper acknowledges this in limitations but the title and framing overclaim scope." 124 }, 125 "alternative_explanations_discussed": { 126 "applies": true, 127 "answer": false, 128 "justification": "No discussion of alternative explanations. The improvement could be due to simple ensemble effects (more compute/calls) rather than the specific Validator mechanism, but this is not addressed." 129 }, 130 "proxy_outcome_distinction": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper claims the method improves 'trustworthiness' of reasoning but only measures accuracy on GSM8K. No discussion of the gap between accuracy and trustworthiness." 134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": true, 139 "answer": true, 140 "justification": "Exact model versions specified: GPT-3.5-turbo-0125, GPT-4o-mini-2024-07-18, Llama-3.1-8B, Llama-3.1-70B." 141 }, 142 "prompts_provided": { 143 "applies": true, 144 "answer": true, 145 "justification": "Full prompt text for all components (IO, CoT, ToT, and Verifier) is provided in the Appendix." 146 }, 147 "hyperparameters_reported": { 148 "applies": true, 149 "answer": true, 150 "justification": "Temperature=1, top_p=1 for IO/CoT/ToT; temperature=0.5, top_p=0.4 for Validator. Tree depth=2, width=5." 151 }, 152 "scaffolding_described": { 153 "applies": true, 154 "answer": true, 155 "justification": "The multi-agent scaffolding is described in detail in Section 3: parallel Reasoner agents with ToT, state evaluation, Thought Validator, consensus voting, and iterative refinement. Figure 1 provides a workflow diagram." 156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": false, 160 "justification": "The paper says 'a random subset of 500 samples from the GSM8K dataset as the test set' without describing how the random selection was done or providing a seed." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 5 is titled 'Limitations and Conclusion' and contains substantive discussion of limitations." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": true, 172 "justification": "Section 5 discusses specific threats: fixed tree depth causing problems for both easy and hard tasks, high computational cost (token usage quantified), and single-benchmark evaluation." 173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": true, 177 "justification": "Section 5 explicitly states: 'while our evaluation on GSM8K demonstrates the effectiveness of our approach for arithmetic reasoning, testing on additional reasoning-intensive benchmarks would help establish the method's generalizability.'" 178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": false, 184 "justification": "No raw experimental outputs (model responses, reasoning trees, per-example results) are made available." 185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": true, 189 "justification": "GSM8K is a well-known benchmark and the paper describes using a random 500-sample subset from it." 190 }, 191 "recruitment_methods_described": { 192 "applies": false, 193 "answer": false, 194 "justification": "No human participants. Data source is a standard benchmark (GSM8K)." 195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": false, 199 "justification": "No documentation of data pipeline: how 500 samples were selected, how model outputs were collected and scored, or how consensus was tracked." 200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": false, 206 "justification": "No funding information or acknowledgments section is present in the paper. One author is from Peraton Labs (industry) but no funding disclosure." 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "Author affiliations are clearly listed: University of Texas at San Antonio and Peraton Labs." 212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": false, 216 "justification": "Funding not disclosed, so independence cannot be assessed. One author is affiliated with Peraton Labs, a defense contractor, with no disclosure of potential interest." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests or financial interests statement is present in the paper." 222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": true, 227 "answer": false, 228 "justification": "No training data cutoff dates stated for any of the four models used." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": true, 232 "answer": false, 233 "justification": "GSM8K was published in 2021. All models used may have been trained on it. No discussion of potential overlap." 234 }, 235 "benchmark_contamination_addressed": { 236 "applies": true, 237 "answer": false, 238 "justification": "GSM8K is a well-known benchmark likely in training data of GPT-3.5-turbo and GPT-4o-mini. No contamination analysis is performed." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants in this study." 246 }, 247 "irb_or_ethics_approval": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants in this study." 251 }, 252 "demographics_reported": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "attrition_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": true, 281 "answer": true, 282 "justification": "Token usage per question reported: CoT 256 tokens vs ToT 4000 tokens for GPT-3.5-turbo, 341 vs 10,600 for GPT-4o-mini. API calls per problem (~20 per Reasoner) also noted." 283 }, 284 "compute_budget_stated": { 285 "applies": true, 286 "answer": true, 287 "justification": "Hardware specified (4x NVIDIA DGX A100 80GB GPUs) and total time (18 hours for all experiments in parallel)." 288 } 289 }, 290 "experimental_rigor": { 291 "seed_sensitivity_reported": { 292 "applies": true, 293 "answer": false, 294 "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be single-run." 295 }, 296 "number_of_runs_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "The number of experimental runs is not stated. Results are presented without indicating how many runs produced them." 300 }, 301 "hyperparameter_search_budget": { 302 "applies": true, 303 "answer": false, 304 "justification": "The Validator uses different temperature/top_p (0.5/0.4) than other components (1/1), but no search budget or justification for these specific values beyond a brief rationale." 305 }, 306 "best_config_selection_justified": { 307 "applies": true, 308 "answer": false, 309 "justification": "Tree depth=2 and width=5 are stated as following Yao et al., but no exploration of alternatives or validation set selection is described." 310 }, 311 "multiple_comparison_correction": { 312 "applies": false, 313 "answer": false, 314 "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable." 315 }, 316 "self_comparison_bias_addressed": { 317 "applies": true, 318 "answer": false, 319 "justification": "The authors implement their own version of ToT and compare against it without acknowledging potential bias from implementing baselines." 320 }, 321 "compute_budget_vs_performance": { 322 "applies": true, 323 "answer": false, 324 "justification": "The method uses ~20 API calls per Reasoner agent times multiple agents, dramatically more compute than baselines, but performance is not shown as a function of compute. The token cost is mentioned in limitations but not normalized against performance." 325 }, 326 "benchmark_construct_validity": { 327 "applies": true, 328 "answer": false, 329 "justification": "GSM8K is used as a proxy for 'reasoning' capability without discussing whether arithmetic word problems measure the kind of reasoning the paper claims to improve." 330 }, 331 "scaffold_confound_addressed": { 332 "applies": true, 333 "answer": false, 334 "justification": "The proposed method adds multiple layers of scaffolding (parallel agents, validator, voting, iterative refinement) compared to baselines. Improvements could be due to the scaffolding rather than the specific ToT+Validator design, but this confound is not discussed." 335 } 336 }, 337 "data_leakage": { 338 "temporal_leakage_addressed": { 339 "applies": true, 340 "answer": false, 341 "justification": "GSM8K was published in 2021. All models (GPT-3.5-turbo, GPT-4o-mini, Llama-3.1) were trained after 2021 and may have seen GSM8K solutions. Not discussed." 342 }, 343 "feature_leakage_addressed": { 344 "applies": true, 345 "answer": false, 346 "justification": "No discussion of whether the evaluation setup leaks information not available in real usage." 347 }, 348 "non_independence_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "No discussion of independence between the 500 sampled test examples or potential similarity to training data." 352 }, 353 "leakage_detection_method": { 354 "applies": true, 355 "answer": false, 356 "justification": "No concrete leakage detection or prevention method is applied." 357 } 358 } 359 }, 360 "claims": [ 361 { 362 "claim": "Multi-agent ToT with Thought Validator outperforms standard ToT by an average 5.6% across four LLMs on GSM8K", 363 "evidence": "Table 1 shows MA-ToT with Validator scores: 84.2, 92.2, 89.0, 94.8 vs ToT: 75.4, 91.6, 80.2, 92.8 across four models.", 364 "supported": "moderate" 365 }, 366 { 367 "claim": "The improvement is 8.8 percentage points for GPT-3.5-turbo (from 75.4% to 84.2%)", 368 "evidence": "Table 1 shows ToT at 75.4% and MA-ToT+Validator at 84.2% for GPT-3.5-turbo.", 369 "supported": "weak" 370 }, 371 { 372 "claim": "ToT benefits are more pronounced for weaker models and diminish for stronger ones", 373 "evidence": "Table 1: GPT-3.5-turbo gains 8.8pp, Llama-3.1-8B gains 8.8pp, but GPT-4o-mini gains only 0.6pp and Llama-3.1-70B gains 2.0pp.", 374 "supported": "moderate" 375 }, 376 { 377 "claim": "The Thought Validator prevents incorrect reasoning from leading to errors in the final answer", 378 "evidence": "Appendix examples show the Validator rejecting incorrect reasoning from R1 and R2 while validating correct R3 reasoning.", 379 "supported": "weak" 380 } 381 ], 382 "methodology_tags": ["benchmark-eval"], 383 "key_findings": "The paper proposes combining Tree-of-Thought reasoning with a Thought Validator agent in a multi-agent framework. On a 500-sample subset of GSM8K, the approach improves accuracy over standard ToT by 0.6-8.8 percentage points across four LLMs, with larger gains for weaker models. The method is computationally expensive, requiring ~20 API calls per Reasoner agent, with token usage increasing from 256 (CoT) to 4000+ (ToT) per question.", 384 "red_flags": [ 385 { 386 "flag": "No error bars or multiple runs", 387 "detail": "All results are single point estimates on a 500-sample subset with no variance reporting, making it impossible to assess whether differences are statistically meaningful." 388 }, 389 { 390 "flag": "Compute-unfair comparison", 391 "detail": "MA-ToT+Validator uses orders of magnitude more compute (multiple agents × ~20 API calls each + validation + potential iterative rounds) than baselines (single call for IO/CoT). The improvement may simply be due to more compute, not the specific method." 392 }, 393 { 394 "flag": "Single benchmark evaluation", 395 "detail": "Claims about 'improving LLM reasoning' are based solely on GSM8K arithmetic reasoning. No evaluation on other reasoning types." 396 }, 397 { 398 "flag": "No ablation study", 399 "detail": "Cannot determine whether gains come from multi-agent voting, the ToT component, the Validator, or the iterative refinement. Missing: ToT-only multi-agent without Validator, Validator without ToT, etc." 400 }, 401 { 402 "flag": "Benchmark contamination risk", 403 "detail": "GSM8K (2021) is likely in the training data of all four models tested. High baseline scores (91-93% for stronger models) may reflect memorization rather than reasoning." 404 } 405 ], 406 "cited_papers": [ 407 { 408 "title": "Training verifiers to solve math word problems", 409 "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian", "Mark Chen"], 410 "arxiv_id": "2110.14168", 411 "relevance": "Introduces GSM8K benchmark used for evaluation; relevant to LLM reasoning evaluation methodology." 412 }, 413 { 414 "title": "Improving factuality and reasoning in language models through multiagent debate", 415 "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba", "Joshua B. Tenenbaum", "Igor Mordatch"], 416 "arxiv_id": "2305.14325", 417 "relevance": "Core multi-agent debate approach for improving LLM reasoning, directly relevant to agentic AI methodology." 418 }, 419 { 420 "title": "Tree of thoughts: Deliberate problem solving with large language models", 421 "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"], 422 "year": 2023, 423 "arxiv_id": "2305.10601", 424 "relevance": "Foundational ToT method that this paper extends; key technique for structured LLM reasoning." 425 }, 426 { 427 "title": "Chain-of-thought prompting elicits reasoning in large language models", 428 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 429 "arxiv_id": "2201.11903", 430 "relevance": "Foundational CoT prompting technique for LLM reasoning, used as baseline." 431 }, 432 { 433 "title": "Large language model based multi-agents: A survey of progress and challenges", 434 "authors": ["Taicheng Guo", "Xiuying Chen", "Yaqi Wang"], 435 "year": 2024, 436 "relevance": "Survey of multi-agent LLM systems, directly relevant to agentic AI survey scope." 437 }, 438 { 439 "title": "Counterfactual debating with preset stances for hallucination elimination of LLMs", 440 "authors": ["Yi Fang", "Moxin Li", "Wenjie Wang"], 441 "arxiv_id": "2406.11514", 442 "relevance": "Multi-agent debate framework for mitigating LLM hallucinations, relevant to AI safety and agentic systems." 443 }, 444 { 445 "title": "HaluEval: A large-scale hallucination evaluation benchmark for large language models", 446 "authors": ["Junyi Li", "Xiaoxue Cheng", "Wayne Xin Zhao"], 447 "arxiv_id": "2305.11747", 448 "relevance": "Hallucination evaluation benchmark for LLMs, relevant to AI evaluation methodology." 449 }, 450 { 451 "title": "Towards CausalGPT: A multi-agent approach for faithful knowledge reasoning via promoting causal consistency in LLMs", 452 "authors": ["Ziyi Tang", "Ruilin Wang", "Weixing Chen"], 453 "arxiv_id": "2308.11914", 454 "relevance": "Multi-agent approach for reasoning verification in LLMs." 455 }, 456 { 457 "title": "The llama 3 herd of models", 458 "authors": ["Abhimanyu Dubey"], 459 "year": 2024, 460 "arxiv_id": "2407.21783", 461 "relevance": "Describes Llama 3.1 models used in experiments; relevant to LLM capability evaluation." 462 } 463 ] 464 }