scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22388B)
      1 {
      2   "scan_version": 2,
      3   "active_modules": ["experimental_rigor", "data_leakage"],
      4   "paper": {
      5     "title": "Improving LLM Reasoning with Multi-Agent Tree-of-Thought Validator Agent",
      6     "authors": ["Fatemeh Haji", "Mazal Bethany", "Maryam Tabar", "Cho-Yu Jason Chiang", "Anthony Rios", "Peyman Najafirad"],
      7     "year": 2024,
      8     "venue": "1st Workshop on Safe and Trustworthy Agents @NeurIPS 2024",
      9     "arxiv_id": "2409.11527",
     10     "doi": "10.48550/arXiv.2409.11527"
     11   },
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "GitHub URL provided in abstract: https://github.com/SecureAIAutonomyLab/MA-ToT"
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "GSM8K is a publicly available benchmark dataset. The paper uses a random subset of 500 samples from it."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "Hardware is mentioned (4x NVIDIA DGX A100 80GB) but no requirements.txt, Dockerfile, or library version details are provided."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are provided in the paper. A GitHub link is given but the paper itself contains no README-level reproduction guidance."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Table 1 reports only point estimates (e.g., 84.2%) with no confidence intervals or error bars."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper claims 'outperforms' based on comparing raw accuracy numbers without any statistical significance tests."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper reports percentage point improvements with baseline context, e.g., '8.8 percentage points over ToT for GPT-3.5-turbo (from 75.4% to 84.2%)'."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper uses 500 samples from GSM8K with no justification for why 500 was chosen or whether this is sufficient for the claimed comparisons."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single runs."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Three baselines are compared: Standard IO, Chain of Thought (CoT), and Tree of Thoughts (ToT)."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "CoT (2022) and ToT (2023) are recent and standard baselines for LLM reasoning evaluation."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No ablation study is presented. The system has multiple components (ToT Reasoners, Thought Validator, consensus voting, iterative refinement) but none are individually ablated."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "Only accuracy is reported as an evaluation metric."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No human evaluation is included. The paper makes claims about 'trustworthiness' of reasoning but only uses automated accuracy."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "They use a random subset of 500 samples from the GSM8K test set. No tuning on this set is described."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "Only overall accuracy numbers per model are reported. No breakdown by problem difficulty, type, or category."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 5 discusses failure modes: fixed tree depth causing unnecessary complexity for easy problems and insufficient depth for hard ones. The appendix shows detailed examples of incorrect reasoning."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper notes that ToT and their method show diminishing returns for models that already perform well on standard IO (e.g., GPT-4o-mini and Llama-3.1-70B show smaller improvements)."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract claims 'outperforming the standard ToT strategy by an average 5.6% across four LLMs' which is supported by Table 1 results."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "The paper claims the Thought Validator 'enhances' and 'improves' reasoning but does not ablate the Validator component separately from the multi-agent setup to establish causality."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "Title says 'Improving LLM Reasoning' broadly but evaluation is only on GSM8K (arithmetic reasoning). The paper acknowledges this in limitations but the title and framing overclaim scope."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "No discussion of alternative explanations. The improvement could be due to simple ensemble effects (more compute/calls) rather than the specific Validator mechanism, but this is not addressed."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper claims the method improves 'trustworthiness' of reasoning but only measures accuracy on GSM8K. No discussion of the gap between accuracy and trustworthiness."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Exact model versions specified: GPT-3.5-turbo-0125, GPT-4o-mini-2024-07-18, Llama-3.1-8B, Llama-3.1-70B."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Full prompt text for all components (IO, CoT, ToT, and Validator) is provided in the Appendix."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Temperature=1, top_p=1 for IO/CoT/ToT; temperature=0.5, top_p=0.4 for Validator. Tree depth=2, width=5."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The multi-agent scaffolding is described in detail in Section 3: parallel Reasoner agents with ToT, state evaluation, Thought Validator, consensus voting, and iterative refinement. Figure 1 provides a workflow diagram."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "The paper says 'a random subset of 500 samples from the GSM8K dataset as the test set' without describing how the random selection was done or providing a seed."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 5 is titled 'Limitations and Conclusion' and contains substantive discussion of limitations."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 5 discusses specific threats: fixed tree depth causing problems for both easy and hard tasks, high computational cost (token usage quantified), and single-benchmark evaluation."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 5 explicitly states: 'while our evaluation on GSM8K demonstrates the effectiveness of our approach for arithmetic reasoning, testing on additional reasoning-intensive benchmarks would help establish the method's generalizability.'"
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No raw experimental outputs (model responses, reasoning trees, per-example results) are made available."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "GSM8K is a well-known benchmark and the paper describes using a random 500-sample subset from it."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. Data source is a standard benchmark (GSM8K)."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No documentation of data pipeline: how 500 samples were selected, how model outputs were collected and scored, or how consensus was tracked."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding information or acknowledgments section is present in the paper. One author is from Peraton Labs (industry), but no funding disclosure is made."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly listed: University of Texas at San Antonio and Peraton Labs."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "Funding not disclosed, so independence cannot be assessed. One author is affiliated with Peraton Labs, a defense contractor, with no disclosure of potential interest."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No training data cutoff dates stated for any of the four models used."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "GSM8K was published in 2021. All models used may have been trained on it. No discussion of potential overlap."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "GSM8K is a well-known benchmark likely in training data of GPT-3.5-turbo and GPT-4o-mini. No contamination analysis is performed."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": true,
    282         "justification": "Token usage per question reported: CoT 256 tokens vs ToT 4000 tokens for GPT-3.5-turbo, 341 vs 10,600 for GPT-4o-mini. API calls per problem (~20 per Reasoner) also noted."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Hardware specified (4x NVIDIA DGX A100 80GB GPUs) and total time (18 hours for all experiments in parallel)."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be single-run."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The number of experimental runs is not stated. Results are presented without indicating how many runs produced them."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The Validator uses different temperature/top_p (0.5/0.4) than other components (1/1), but no search budget is reported and these specific values are justified only by a brief rationale."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "Tree depth=2 and width=5 are stated as following Yao et al., but no exploration of alternatives or validation set selection is described."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": false,
    313         "answer": false,
    314         "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors implement their own version of ToT and compare against it without acknowledging potential bias from implementing baselines."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The method uses ~20 API calls per Reasoner agent times multiple agents, dramatically more compute than baselines, but performance is not shown as a function of compute. The token cost is mentioned in limitations but not normalized against performance."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "GSM8K is used as a proxy for 'reasoning' capability without discussing whether arithmetic word problems measure the kind of reasoning the paper claims to improve."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The proposed method adds multiple layers of scaffolding (parallel agents, validator, voting, iterative refinement) compared to baselines. Improvements could be due to the scaffolding rather than the specific ToT+Validator design, but this confound is not discussed."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "GSM8K was published in 2021. All models (GPT-3.5-turbo, GPT-4o-mini, Llama-3.1) were trained after 2021 and may have seen GSM8K solutions. Not discussed."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether the evaluation setup leaks information not available in real usage."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of independence between the 500 sampled test examples or potential similarity to training data."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No concrete leakage detection or prevention method is applied."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "Multi-agent ToT with Thought Validator outperforms standard ToT by an average 5.6% across four LLMs on GSM8K",
    363       "evidence": "Table 1 shows MA-ToT with Validator scores: 84.2, 92.2, 89.0, 94.8 vs ToT: 75.4, 91.6, 80.2, 92.8 across four models.",
    364       "supported": "moderate"
    365     },
    366     {
    367       "claim": "The improvement is 8.8 percentage points for GPT-3.5-turbo (from 75.4% to 84.2%)",
    368       "evidence": "Table 1 shows ToT at 75.4% and MA-ToT+Validator at 84.2% for GPT-3.5-turbo.",
    369       "supported": "weak"
    370     },
    371     {
    372       "claim": "ToT benefits are more pronounced for weaker models and diminish for stronger ones",
    373       "evidence": "Table 1: GPT-3.5-turbo gains 8.8pp, Llama-3.1-8B gains 8.8pp, but GPT-4o-mini gains only 0.6pp and Llama-3.1-70B gains 2.0pp.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "The Thought Validator prevents incorrect reasoning from leading to errors in the final answer",
    378       "evidence": "Appendix examples show the Validator rejecting incorrect reasoning from R1 and R2 while validating correct R3 reasoning.",
    379       "supported": "weak"
    380     }
    381   ],
    382   "methodology_tags": ["benchmark-eval"],
    383   "key_findings": "The paper proposes combining Tree-of-Thought reasoning with a Thought Validator agent in a multi-agent framework. On a 500-sample subset of GSM8K, the approach improves accuracy over standard ToT by 0.6-8.8 percentage points across four LLMs, with larger gains for weaker models. The method is computationally expensive, requiring ~20 API calls per Reasoner agent, with token usage increasing from 256 (CoT) to 4000+ (ToT) per question.",
    384   "red_flags": [
    385     {
    386       "flag": "No error bars or multiple runs",
    387       "detail": "All results are single point estimates on a 500-sample subset with no variance reporting, making it impossible to assess whether differences are statistically meaningful."
    388     },
    389     {
    390       "flag": "Compute-unfair comparison",
    391       "detail": "MA-ToT+Validator uses orders of magnitude more compute (multiple agents × ~20 API calls each + validation + potential iterative rounds) than baselines (single call for IO/CoT). The improvement may simply be due to more compute, not the specific method."
    392     },
    393     {
    394       "flag": "Single benchmark evaluation",
    395       "detail": "Claims about 'improving LLM reasoning' are based solely on GSM8K arithmetic reasoning. No evaluation on other reasoning types."
    396     },
    397     {
    398       "flag": "No ablation study",
    399       "detail": "Cannot determine whether gains come from multi-agent voting, the ToT component, the Validator, or the iterative refinement. Missing: ToT-only multi-agent without Validator, Validator without ToT, etc."
    400     },
    401     {
    402       "flag": "Benchmark contamination risk",
    403       "detail": "GSM8K (2021) is likely in the training data of all four models tested. High baseline scores (91-93% for stronger models) may reflect memorization rather than reasoning."
    404     }
    405   ],
    406   "cited_papers": [
    407     {
    408       "title": "Training verifiers to solve math word problems",
    409       "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian", "Mark Chen"],
    410       "arxiv_id": "2110.14168",
    411       "relevance": "Introduces GSM8K benchmark used for evaluation; relevant to LLM reasoning evaluation methodology."
    412     },
    413     {
    414       "title": "Improving factuality and reasoning in language models through multiagent debate",
    415       "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba", "Joshua B. Tenenbaum", "Igor Mordatch"],
    416       "arxiv_id": "2305.14325",
    417       "relevance": "Core multi-agent debate approach for improving LLM reasoning, directly relevant to agentic AI methodology."
    418     },
    419     {
    420       "title": "Tree of thoughts: Deliberate problem solving with large language models",
    421       "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"],
    422       "year": 2023,
    423       "arxiv_id": "2305.10601",
    424       "relevance": "Foundational ToT method that this paper extends; key technique for structured LLM reasoning."
    425     },
    426     {
    427       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    428       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    429       "arxiv_id": "2201.11903",
    430       "relevance": "Foundational CoT prompting technique for LLM reasoning, used as baseline."
    431     },
    432     {
    433       "title": "Large language model based multi-agents: A survey of progress and challenges",
    434       "authors": ["Taicheng Guo", "Xiuying Chen", "Yaqi Wang"],
    435       "year": 2024,
    436       "relevance": "Survey of multi-agent LLM systems, directly relevant to agentic AI survey scope."
    437     },
    438     {
    439       "title": "Counterfactual debating with preset stances for hallucination elimination of LLMs",
    440       "authors": ["Yi Fang", "Moxin Li", "Wenjie Wang"],
    441       "arxiv_id": "2406.11514",
    442       "relevance": "Multi-agent debate framework for mitigating LLM hallucinations, relevant to AI safety and agentic systems."
    443     },
    444     {
    445       "title": "HaluEval: A large-scale hallucination evaluation benchmark for large language models",
    446       "authors": ["Junyi Li", "Xiaoxue Cheng", "Wayne Xin Zhao"],
    447       "arxiv_id": "2305.11747",
    448       "relevance": "Hallucination evaluation benchmark for LLMs, relevant to AI evaluation methodology."
    449     },
    450     {
    451       "title": "Towards CausalGPT: A multi-agent approach for faithful knowledge reasoning via promoting causal consistency in LLMs",
    452       "authors": ["Ziyi Tang", "Ruilin Wang", "Weixing Chen"],
    453       "arxiv_id": "2308.11914",
    454       "relevance": "Multi-agent approach for reasoning verification in LLMs."
    455     },
    456     {
    457       "title": "The llama 3 herd of models",
    458       "authors": ["Abhimanyu Dubey"],
    459       "year": 2024,
    460       "arxiv_id": "2407.21783",
    461       "relevance": "Describes Llama 3.1 models used in experiments; relevant to LLM capability evaluation."
    462     }
    463   ]
    464 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs