scan-v5.json (25954B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Improving LLM Reasoning with Multi-Agent Tree-of-Thought Validator Agent", 6 "authors": [ 7 "Fatemeh Haji", 8 "Mazal Bethany", 9 "Maryam Tabar", 10 "Cho-Yu Jason Chiang", 11 "Anthony Rios", 12 "Peyman Najafirad" 13 ], 14 "year": 2024, 15 "venue": "NeurIPS 2024 Workshop on Safe and Trustworthy Agents", 16 "arxiv_id": "2409.11527", 17 "doi": "10.48550/arXiv.2409.11527" 18 }, 19 "checklist": { 20 "claims_and_evidence": { 21 "abstract_claims_supported": { 22 "applies": true, 23 "answer": true, 24 "justification": "Abstract claims 5.6% improvement over ToT; paper reports 5.6-8.8pp improvements depending on model. Claim about filtering flawed reasoning is demonstrated in appendix examples.", 25 "source": "haiku" 26 }, 27 "causal_claims_justified": { 28 "applies": true, 29 "answer": false, 30 "justification": "Paper compares against ToT baseline but lacks ablation isolating validator contribution from ensemble voting effect. Cannot determine if improvement comes from validation or just from multiple agents with voting.", 31 "source": "haiku" 32 }, 33 "generalization_bounded": { 34 "applies": true, 35 "answer": true, 36 "justification": "Evaluation limited to GSM8K arithmetic reasoning. Conclusion explicitly states 'particularly for complex arithmetic reasoning tasks.' Authors acknowledge 'testing on additional reasoning-intensive benchmarks would help establish generalizability.'", 37 "source": "haiku" 38 }, 39 "alternative_explanations_discussed": { 40 "applies": true, 41 "answer": false, 42 "justification": "Improvement could stem from: (1) more compute tokens (4000 vs 256 for CoT), (2) ensemble voting, or (3) actual validation. 
Paper acknowledges token overhead but doesn't explore whether improvement comes from validation vs ensemble.", 43 "source": "haiku" 44 }, 45 "proxy_outcome_distinction": { 46 "applies": true, 47 "answer": false, 48 "justification": "Paper claims 'improving LLM reasoning' but measures only final answer correctness on math problems. Doesn't distinguish reasoning quality from luck; no analysis of whether reasoning chains are actually sound.", 49 "source": "haiku" 50 } 51 }, 52 "limitations_and_scope": { 53 "limitations_section_present": { 54 "applies": true, 55 "answer": true, 56 "justification": "Section 5 titled 'Limitations and Conclusion' discusses fixed tree depth/width constraints, computational expense with token counts, and generalizability beyond GSM8K.", 57 "source": "haiku" 58 }, 59 "threats_to_validity_specific": { 60 "applies": true, 61 "answer": true, 62 "justification": "Specific threats: fixed depth=2/width=5 suboptimal for easy problems (unnecessary complexity) and hard problems (insufficient depth). Token usage increases 10-40x. Generalizes only to arithmetic. Validator sometimes misses errors (Table 4).", 63 "source": "haiku" 64 }, 65 "scope_boundaries_stated": { 66 "applies": true, 67 "answer": true, 68 "justification": "Paper explicitly states results apply to 'complex arithmetic reasoning tasks' on GSM8K. Notes trade-off considerations 'in resource-constrained environments.' Does not claim broad reasoning improvement.", 69 "source": "haiku" 70 } 71 }, 72 "conflicts_of_interest": { 73 "funding_disclosed": { 74 "applies": true, 75 "answer": false, 76 "justification": "No funding source stated in paper. No grants, sponsorships, or financial support mentioned.", 77 "source": "haiku" 78 }, 79 "affiliations_disclosed": { 80 "applies": true, 81 "answer": true, 82 "justification": "All author affiliations listed: UTSA Secure AI Lab (5 authors), Peraton Labs (1 author). 
No apparent conflict with evaluated system.", 83 "source": "haiku" 84 }, 85 "funder_independent_of_outcome": { 86 "applies": false, 87 "answer": false, 88 "justification": "No funder identified.", 89 "source": "haiku" 90 }, 91 "financial_interests_declared": { 92 "applies": true, 93 "answer": false, 94 "justification": "No competing interests statement. No patents, equity, or consulting relationships disclosed.", 95 "source": "haiku" 96 } 97 }, 98 "scope_and_framing": { 99 "key_terms_defined": { 100 "applies": true, 101 "answer": false, 102 "justification": "Key terms underspecified: 'reasoning' used without formal definition beyond math problem-solving; 'trustworthiness' mentioned in abstract/conclusion but never formally defined; 'reasoning paths' and 'reasoning branches' used interchangeably.", 103 "source": "haiku" 104 }, 105 "intended_contribution_clear": { 106 "applies": true, 107 "answer": true, 108 "justification": "Three explicit contributions stated: (1) ToT integration into multi-agent framework, (2) Thought Validator agent design, (3) experimental results on GSM8K. Reader knows the paper claims a method + validation approach.", 109 "source": "haiku" 110 }, 111 "engagement_with_prior_work": { 112 "applies": true, 113 "answer": true, 114 "justification": "Related work section engages with: multi-agent systems (CFMAD, CausalGPT), ToT/CoT baselines, over-generation strategies. Positions contribution as addressing 'shallow exploration' limitation in Reasoner agents.", 115 "source": "haiku" 116 } 117 } 118 }, 119 "type_checklist": { 120 "empirical": { 121 "artifacts": { 122 "code_released": { 123 "applies": true, 124 "answer": true, 125 "justification": "GitHub link provided: https://github.com/SecureAIAutonomyLab/MA-ToT. Code is publicly available.", 126 "source": "haiku" 127 }, 128 "data_released": { 129 "applies": true, 130 "answer": true, 131 "justification": "Evaluated on GSM8K, a public benchmark. 
No new data collected; standard public dataset used unmodified.", 132 "source": "haiku" 133 }, 134 "environment_specified": { 135 "applies": true, 136 "answer": false, 137 "justification": "Hardware specified (4× DGX A100 80GB) and model versions exact (GPT-3.5-turbo-0125, GPT-4o-mini-2024-07-18, Llama 3.1 8B/70B), but no requirements.txt, Dockerfile, or Python version specified.", 138 "source": "haiku" 139 }, 140 "reproduction_instructions": { 141 "applies": true, 142 "answer": false, 143 "justification": "Appendix provides exact prompts used, but no step-by-step runnable instructions. No script commands like 'python run.py --dataset gsm8k'. GitHub code presumably contains this.", 144 "source": "haiku" 145 } 146 }, 147 "statistical_methodology": { 148 "confidence_intervals_or_error_bars": { 149 "applies": true, 150 "answer": false, 151 "justification": "Table 1 reports raw accuracy percentages with no confidence intervals, error bars, or variance measures. Single 500-sample test set, no repeated runs reported.", 152 "source": "haiku" 153 }, 154 "significance_tests": { 155 "applies": true, 156 "answer": false, 157 "justification": "Comparative claims made (0.6-8.8pp improvements) but no statistical significance testing, p-values, or t-tests. Cannot determine if differences are significant vs. noise.", 158 "source": "haiku" 159 }, 160 "effect_sizes_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Improvements in percentage points reported: GPT-3.5 8.8pp, GPT-4o-mini 0.6pp, Llama-8B 8.8pp, Llama-70B 2pp. These are effect sizes.", 164 "source": "haiku" 165 }, 166 "sample_size_justified": { 167 "applies": true, 168 "answer": false, 169 "justification": "500 samples chosen from GSM8K's ~8.5K problems, but no justification provided. 
No power analysis, no discussion of adequacy.", 170 "source": "haiku" 171 }, 172 "variance_reported": { 173 "applies": true, 174 "answer": false, 175 "justification": "No standard deviation, confidence intervals, or multiple run results. Single pass on fixed test set.", 176 "source": "haiku" 177 } 178 }, 179 "evaluation_design": { 180 "baselines_included": { 181 "applies": true, 182 "answer": true, 183 "justification": "Three baselines: IO prompting, Chain-of-Thought, Tree-of-Thoughts. Results compared against all three.", 184 "source": "haiku" 185 }, 186 "baselines_contemporary": { 187 "applies": true, 188 "answer": true, 189 "justification": "CoT (Wei et al. 2022), ToT (Yao et al. 2023) are current methods. Baselines are contemporary as of 2024.", 190 "source": "haiku" 191 }, 192 "ablation_study": { 193 "applies": true, 194 "answer": false, 195 "justification": "No ablation isolating validator contribution vs ensemble voting effect. No comparison of: (1) ToT alone, (2) multiple ToT runs with simple majority voting, (3) same approach with validator. Cannot disentangle mechanisms.", 196 "source": "haiku" 197 }, 198 "multiple_metrics": { 199 "applies": true, 200 "answer": false, 201 "justification": "Only accuracy metric reported. No error analysis, latency, cost-normalized performance, or reasoning quality metrics.", 202 "source": "haiku" 203 }, 204 "human_evaluation": { 205 "applies": true, 206 "answer": false, 207 "justification": "No human evaluation of reasoning chain quality or validator accuracy. Only automatic correctness of final answers measured.", 208 "source": "haiku" 209 }, 210 "held_out_test_set": { 211 "applies": true, 212 "answer": true, 213 "justification": "500-sample random subset from GSM8K used as held-out test set, separate from any training.", 214 "source": "haiku" 215 }, 216 "per_category_breakdown": { 217 "applies": true, 218 "answer": false, 219 "justification": "Results only disaggregated by model (GPT-3.5, GPT-4o, Llama 8B, 70B). 
No breakdown by problem difficulty, type, or category.", 220 "source": "haiku" 221 }, 222 "failure_cases_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "Appendix shows example failures (Problem 1 rounds 1-2 where validators mark reasoning invalid), but no systematic analysis of failure patterns or error types.", 226 "source": "haiku" 227 }, 228 "negative_results_reported": { 229 "applies": true, 230 "answer": false, 231 "justification": "All results positive across all models. Paper notes improvements smaller on capable models (0.6pp for GPT-4o-mini) but frames this as model-dependent, not negative.", 232 "source": "haiku" 233 } 234 }, 235 "setup_transparency": { 236 "model_versions_specified": { 237 "applies": true, 238 "answer": true, 239 "justification": "Exact API versions: GPT-3.5-turbo-0125, GPT-4o-mini-2024-07-18, Llama 3.1 8B/70B with HuggingFace references. No ambiguity on model snapshots.", 240 "source": "haiku" 241 }, 242 "prompts_provided": { 243 "applies": true, 244 "answer": true, 245 "justification": "Appendix provides complete, exact prompts: IO, CoT, ToT (with reasoning format), and Verifier prompts. All templates filled with actual content.", 246 "source": "haiku" 247 }, 248 "hyperparameters_reported": { 249 "applies": true, 250 "answer": true, 251 "justification": "Temperature (1.0 for IO/CoT/ToT, 0.5 for Validator), top_p (1.0 vs 0.4), tree depth=2, width=5, Reasoner count (3 from examples).", 252 "source": "haiku" 253 }, 254 "scaffolding_described": { 255 "applies": true, 256 "answer": true, 257 "justification": "Detailed description: ToT decomposition, state evaluation (voting), path selection (greedy), reasoning branch construction, Thought Validator (logic/fact/completeness checks), consensus voting, iterative refinement.", 258 "source": "haiku" 259 }, 260 "data_preprocessing_documented": { 261 "applies": true, 262 "answer": true, 263 "justification": "Minimal: 'random subset of 500 samples from GSM8K.' 
No filtering, augmentation, or preprocessing applied.", 264 "source": "haiku" 265 } 266 }, 267 "data_integrity": { 268 "raw_data_available": { 269 "applies": true, 270 "answer": true, 271 "justification": "GSM8K is publicly available benchmark (Cobbe et al., 2021). Raw data accessible for independent verification.", 272 "source": "haiku" 273 }, 274 "data_collection_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No new data collected. GSM8K is existing public benchmark. Not applicable.", 278 "source": "haiku" 279 }, 280 "recruitment_methods_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in study. Not applicable.", 284 "source": "haiku" 285 }, 286 "data_pipeline_documented": { 287 "applies": true, 288 "answer": true, 289 "justification": "Pipeline: select 500 random samples from GSM8K → run through 4 LLMs with 3 reasoning methods + validator → aggregate results into Table 1. Simple pipeline, documented.", 290 "source": "haiku" 291 } 292 }, 293 "contamination": { 294 "training_cutoff_stated": { 295 "applies": true, 296 "answer": true, 297 "justification": "Model cutoff dates known: GPT-3.5 (April 2023), GPT-4o-mini (July 2024), Llama 3.1 (July 2024). GSM8K published 2021, before all cutoffs.", 298 "source": "haiku" 299 }, 300 "train_test_overlap_discussed": { 301 "applies": true, 302 "answer": false, 303 "justification": "Potential overlap not discussed. GSM8K predates the models' training cutoffs, so contamination is possible, but paper does not address this explicitly.", 304 "source": "haiku" 305 }, 306 "benchmark_contamination_addressed": { 307 "applies": true, 308 "answer": false, 309 "justification": "GSM8K created 2021, before all evaluated model training cutoffs. 
Contamination is therefore possible and is not explicitly confirmed or ruled out in paper.", 310 "source": "haiku" 311 } 312 }, 313 "human_studies": { 314 "pre_registered": { 315 "applies": false, 316 "answer": false, 317 "justification": "No human participants. Not applicable.", 318 "source": "haiku" 319 }, 320 "irb_or_ethics_approval": { 321 "applies": false, 322 "answer": false, 323 "justification": "No human participants. Not applicable.", 324 "source": "haiku" 325 }, 326 "demographics_reported": { 327 "applies": false, 328 "answer": false, 329 "justification": "No human participants. Not applicable.", 330 "source": "haiku" 331 }, 332 "inclusion_exclusion_criteria": { 333 "applies": false, 334 "answer": false, 335 "justification": "No human participants. Not applicable.", 336 "source": "haiku" 337 }, 338 "randomization_described": { 339 "applies": false, 340 "answer": false, 341 "justification": "No human participants. Not applicable.", 342 "source": "haiku" 343 }, 344 "blinding_described": { 345 "applies": false, 346 "answer": false, 347 "justification": "No human participants. Not applicable.", 348 "source": "haiku" 349 }, 350 "attrition_reported": { 351 "applies": false, 352 "answer": false, 353 "justification": "No human participants. Not applicable.", 354 "source": "haiku" 355 } 356 }, 357 "cost_and_practicality": { 358 "inference_cost_reported": { 359 "applies": true, 360 "answer": true, 361 "justification": "Token usage detailed: GPT-3.5-turbo 256 (CoT) → 4,000 (ToT); GPT-4o-mini 341 → 10,600. ~20 API calls per problem × N agents + validation. Latency not reported.", 362 "source": "haiku" 363 }, 364 "compute_budget_stated": { 365 "applies": true, 366 "answer": false, 367 "justification": "Experiment runtime stated (18 hours on 4× DGX A100), but total cost in API dollars not calculated. 
Per-deployment cost not estimated.", 368 "source": "haiku" 369 } 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "Multi-agent ToT with Thought Validator improves accuracy on arithmetic reasoning tasks", 376 "evidence": "Table 1 shows 0.6-8.8pp improvements over baseline ToT across four models on GSM8K. GPT-3.5-turbo: 75.4% → 84.2%, Llama-8B: 80.2% → 89.0%.", 377 "supported": "strong" 378 }, 379 { 380 "claim": "Thought Validator agent effectively identifies and filters invalid reasoning branches", 381 "evidence": "Appendix examples show validator marking R1/R2 outputs as invalid in Problem 1, with R3's valid reasoning surviving. However, Table 4 shows validator missed a calculation error in R2 round 3.", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "Validator approach is superior to majority voting on unfiltered paths", 386 "evidence": "Implicit comparison: validator voting vs. plain ToT majority voting. No direct ablation provided.", 387 "supported": "weak" 388 }, 389 { 390 "claim": "Method scales across different model sizes and capabilities", 391 "evidence": "Evaluated on 4 models ranging 8B to GPT-4o-mini. Improvements vary (0.6pp for strong models, 8.8pp for weak), indicating differential benefit.", 392 "supported": "strong" 393 }, 394 { 395 "claim": "Fixed tree depth/width constraints limit reasoning path exploration", 396 "evidence": "Limitations section observes: predetermined depth=2 adds unnecessary complexity for easy problems, insufficient for hard problems. Qualitative observation only, no measurement.", 397 "supported": "moderate" 398 }, 399 { 400 "claim": "Iterative refinement with validator feedback improves consensus", 401 "evidence": "Examples show iterative rounds (Problem 1: 3 rounds until consensus on $26). 
Quantitative consensus rate not reported.", 402 "supported": "moderate" 403 } 404 ], 405 "methodology_tags": [ 406 "benchmark-eval", 407 "case-study" 408 ], 409 "key_findings": "The paper proposes a multi-agent reasoning system combining Tree-of-Thoughts with a Thought Validator agent, achieving 0.6–8.8 percentage point improvements over baseline ToT on GSM8K arithmetic reasoning across four models. Improvements are largest for weaker models (GPT-3.5-turbo: +8.8pp) and diminish for stronger models (GPT-4o-mini: +0.6pp). However, the approach incurs 10–40× token overhead (4000–10,600 tokens vs. 256–341 for CoT) and lacks ablations isolating the validator's contribution from ensemble voting effects, limiting mechanistic understanding.", 410 "red_flags": [ 411 { 412 "flag": "No ablation of validator effect vs ensemble voting", 413 "detail": "Missing comparison of: (1) single ToT run, (2) multiple ToT runs with simple majority voting, (3) same with validator. Cannot determine if improvement comes from validation or just more compute/ensemble." 414 }, 415 { 416 "flag": "Single benchmark evaluation only", 417 "detail": "All results on GSM8K (arithmetic only). No evaluation on logic, commonsense, code reasoning, or other domains claimed in abstract ('LLM Reasoning' broadly)." 418 }, 419 { 420 "flag": "No statistical significance testing", 421 "detail": "Single 500-sample test set, no error bars or confidence intervals. Cannot determine if reported improvements (0.6–8.8pp) are significant vs. noise." 422 }, 423 { 424 "flag": "Validator accuracy not measured", 425 "detail": "How often does validator correctly identify invalid reasoning? Examples show it sometimes misses errors (Table 4, R2 round 3: calculation error not caught). No precision/recall reported." 426 }, 427 { 428 "flag": "Unfair baseline comparison", 429 "detail": "Validator uses temperature=0.5, top_p=0.4 while baselines use temperature=1.0, top_p=1.0. 
Different hyperparameter regimes make comparison confounded." 430 }, 431 { 432 "flag": "No sample size justification", 433 "detail": "500 samples selected from GSM8K's ~8.5K problems without justification. No power analysis or adequacy discussion." 434 }, 435 { 436 "flag": "Conflates answer correctness with reasoning quality", 437 "detail": "Claims 'improving reasoning' but only measures final answer correctness. No human evaluation of whether reasoning chains are actually sound or trustworthy." 438 }, 439 { 440 "flag": "Generalization claims unsupported by scope", 441 "detail": "Title claims improvement to 'LLM Reasoning' (broad), but evaluation limited to arithmetic. Abstract doesn't reflect narrow scope; abstract claims should match tested domain." 442 } 443 ], 444 "cited_papers": [ 445 { 446 "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models", 447 "relevance": "Core method basis; paper integrates ToT into multi-agent framework." 448 }, 449 { 450 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 451 "relevance": "Foundational baseline (Wei et al.); paper builds on CoT for reasoner agents." 452 }, 453 { 454 "title": "Improving Factuality and Reasoning in Language Models Through Multiagent Debate", 455 "relevance": "Multi-agent reasoning precedent; paper adopts similar collaborative approach (Du et al.)." 456 }, 457 { 458 "title": "Counterfactual Debating with Preset Stances for Hallucination Elimination of LLMs", 459 "relevance": "Validation mechanism precedent; similar approach to filtering flawed reasoning (Fang et al.)." 460 }, 461 { 462 "title": "Towards CausalGPT: A Multi-Agent Approach for Faithful Knowledge Reasoning", 463 "relevance": "Evaluative layer for reasoning verification; similar validator concept (Tang et al.)." 
464 }, 465 { 466 "title": "HaluEval: A Large-Scale Hallucination Evaluation Benchmark for LLMs", 467 "relevance": "Evaluates LLM's ability to identify errors; supports claim that LLMs can validate reasoning (Li et al.)." 468 }, 469 { 470 "title": "Training Verifiers to Solve Math Word Problems (GSM8K)", 471 "relevance": "Benchmark used for evaluation; foundational dataset for arithmetic reasoning tasks (Cobbe et al.)." 472 }, 473 { 474 "title": "The Llama 3 Herd of Models", 475 "relevance": "Model used in experiments (Llama 3.1 8B/70B); baseline for comparison (Dubey et al.)." 476 } 477 ], 478 "engagement_factors": { 479 "practical_relevance": { 480 "score": 2, 481 "justification": "Code released and prompts provided, but 10–40× token overhead (4000–10,600 vs. 256 tokens) limits practical deployment to high-value arithmetic reasoning scenarios." 482 }, 483 "surprise_contrarian": { 484 "score": 1, 485 "justification": "Multi-agent reasoning + validation is intuitive; no surprising findings. Results align with prior expectations that more agents and checking help accuracy." 486 }, 487 "fear_safety": { 488 "score": 1, 489 "justification": "Mentions 'trustworthiness' and ethical considerations in social impact statement, but core contribution is accuracy on math problems, not safety or alignment." 490 }, 491 "drama_conflict": { 492 "score": 0, 493 "justification": "Straightforward method paper with positive results. No controversies, competing claims, or dramatic findings." 494 }, 495 "demo_ability": { 496 "score": 2, 497 "justification": "Code and prompts available; could replicate with access to GPT/Llama APIs. Requires infrastructure and API budget, limiting quick demos." 498 }, 499 "brand_recognition": { 500 "score": 1, 501 "justification": "Authors from UTSA Secure AI Lab (emerging) and Peraton Labs (defense contractor, low visibility). Published at NeurIPS 2024 workshop (not main conference)." 
502 } 503 }, 504 "hn_data": { 505 "threads": [ 506 { 507 "hn_id": "37985510", 508 "title": "Magnetic Fusion Plasma Drive", 509 "points": 14, 510 "comments": 1, 511 "url": "https://news.ycombinator.com/item?id=37985510", 512 "created_at": "2023-10-23T13:43:37Z" 513 }, 514 { 515 "hn_id": "41291801", 516 "title": "MINT-1T: Open-Source Multimodal Dataset with One Trillion Tokens", 517 "points": 3, 518 "comments": 0, 519 "url": "https://news.ycombinator.com/item?id=41291801", 520 "created_at": "2024-08-19T15:19:51Z" 521 }, 522 { 523 "hn_id": "41633709", 524 "title": "A Preliminary Study of O1 in Medicine: Are We Closer to an AI Doctor", 525 "points": 2, 526 "comments": 0, 527 "url": "https://news.ycombinator.com/item?id=41633709", 528 "created_at": "2024-09-24T06:46:42Z" 529 }, 530 { 531 "hn_id": "41319942", 532 "title": "The Vizier Gaussian Process Bandit Algorithm", 533 "points": 2, 534 "comments": 0, 535 "url": "https://news.ycombinator.com/item?id=41319942", 536 "created_at": "2024-08-22T13:19:32Z" 537 }, 538 { 539 "hn_id": "41334001", 540 "title": "The Vizier Gaussian Process Bandit Algorithm", 541 "points": 1, 542 "comments": 1, 543 "url": "https://news.ycombinator.com/item?id=41334001", 544 "created_at": "2024-08-23T23:26:19Z" 545 }, 546 { 547 "hn_id": "41651058", 548 "title": "Towards Empathetic Conversational Recommender Systems", 549 "points": 1, 550 "comments": 0, 551 "url": "https://news.ycombinator.com/item?id=41651058", 552 "created_at": "2024-09-25T19:37:54Z" 553 } 554 ], 555 "top_points": 14, 556 "total_points": 23, 557 "total_comments": 2 558 } 559 }