scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19014B)
      1 {
      2   "paper": {
      3     "title": "CoT-based Synthesizer: Enhancing LLM Performance through Answer Synthesis",
      4     "authors": ["Bohan Zhang", "Xiaokang Zhang", "Jing Zhang", "Jifan Yu", "Sijia Luo", "Jie Tang"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2501.01668"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub repository URL provided: https://github.com/RUCKBReasoning/CoT-based-Synthesizer (Section 1, footnote 1)."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses public benchmarks (GSM8k, MATH500, WikiTQ, FeTaQA). The abstract states 'training data and code are publicly available on the repository.' Appendix A mentions release under CC BY-SA 4.0 upon acceptance."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Appendix C.1 specifies Ubuntu 22.04, PyTorch 2.4.0, 8 NVIDIA A800 80GB GPUs, Intel Xeon Platinum 8358, 2048GB RAM, and mentions Transformers and vLLM."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub repo is referenced but the paper itself does not include a 'Reproducing Results' section or specific commands."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Despite averaging over 3 runs, no confidence intervals or error bars are reported. All tables show point estimates only."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims their method 'significantly improves' and 'outperforms' baselines but provides no statistical significance tests (no p-values, t-tests, etc.)."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Percentage improvements over baselines are reported with context, e.g., '11.8% for Llama3-8B and 10.3% for GPT-4o on MATH500' and detailed improvement columns in Tables 2 and 4."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification for why 3 runs were chosen, no power analysis. The choice of benchmarks and number of policy models is not justified."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Each experiment is 'conducted three times' and 'reported results are the average' (Section 5.1), but no standard deviation, variance, or spread measures are reported."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Multiple baselines included: CoT-prompting, Self-consistency, USC, ArmoRM, Scalar RM, and LMCOR (Section 5.1)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include recent methods: ArmoRM (2024), USC (2023), LMCOR (2023), using recent models like GPT-4o and Qwen2.5."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 5.3 ablates CoT training and the data generation pipeline. Appendix B ablates LLM Repair and Response LLM Sampling (Tables 2 and 4)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Exact Match (EM) accuracy for GSM8k, MATH500, WikiTQ, and Rouge-L for FeTaQA (Section 5.1)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of synthesized outputs. All evaluation is automated via EM or Rouge-L."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Training is on MATH and WikiTQ training sets; evaluation is on separate test sets. GSM8k and FeTaQA are unseen during training, serving as transfer tests."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 1 provides per-benchmark, per-model breakdowns. Table 3 provides per-correct-count breakdowns."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 3 analyzes performance by number of correct candidates. Figures 5 and 6 show qualitative examples. The Limitations section discusses the constraints of splitting candidates into groups because of input-length limits."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Table 2 shows ablation variants that hurt performance (w/o training degrades on GSM8k by 1.8 avg; w/o CoT training degrades on some models). Some models show negative deltas."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims of 11.8% for Llama3-8B and 10.3% for GPT-4o on MATH500 are supported by Table 1 (24.2→36.0 and 62.5→72.8). Claims of outperforming SC and BoN are supported."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about component contributions are supported by controlled ablation studies (Tables 2 and 4) that isolate individual components."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title and abstract claim general 'LLM Performance' enhancement, but results are limited to mathematical reasoning and table QA tasks. No explicit bounding of generalization scope."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No discussion of alternative explanations for the improvements. No threats-to-validity section addressing confounds."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Table 6 lists specific model versions with HuggingFace URLs: GPT-4o-2024-0513, Llama3-8B-Instruct, Llama-3.1-70B-Instruct, Qwen2-7B-Instruct, Qwen2.5-14B-Instruct, etc."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Appendix E provides full prompt text for Synthesizer inference, LLM Repair, and CoT-prompting for each dataset (MATH500, GSM8k, TableQA)."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Table 5 reports temperature=0.9, Top-P=0.9, Max Tokens=1024. Appendix C.3 reports LR=2e-6, weight decay=1e-2, batch size=128, 2 epochs, BF16, max length 4096."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The method is a single-pass synthesis pipeline, not an agent with tools, retries, or feedback loops."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.2 describes the data pipeline: candidate generation, filtering via gold answer comparison (math) or CritiqueLLM scoring ≥8 (TableQA), LLM Repair stage. Dataset sizes documented in Table 7."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "A dedicated 'Limitations' section discusses grouping constraints due to input length and inference overhead."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "The Limitations section discusses only practical limitations (input length, inference overhead), not threats to validity of the experimental conclusions."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit statement about what the results do NOT show. The paper does not bound its claims to the tested task types (math, table QA) or model families."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Benchmarks used (GSM8k, MATH, WikiTQ, FeTaQA) are publicly available. Training data and code are stated to be released on GitHub."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4.2 describes the full data generation pipeline including sampling parameters, filtering criteria, and repair process. Table 7 shows dataset sizes."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data comes from standard benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4.2 and Appendix C.3 document the pipeline: 50 samplings per MATH sample, filtering by gold answer, LLM Repair with 20 additional samplings. 12k→295k MATH, 18k→87k WikiTQ."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Acknowledgments section lists National Key Research & Develop Plan (2023YFF0725100) and NSFC grants."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations clearly listed: Renmin University of China, Tsinghua University. One author interned at Zhipu AI (footnote)."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funding is from Chinese government research programs (NSFC, National Key R&D Plan), which have no financial stake in the specific results."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement. One author interned at Zhipu AI (which produces GLM-4-Plus, one of the evaluated models), but no conflict disclosure."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No training cutoff dates stated for any models used (GPT-4o, GLM-4-Plus, Llama, Qwen)."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether the policy models may have seen the benchmark test data during pretraining."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "GSM8k (2021), MATH (2021), WikiTQ (2015), and FeTaQA (2022) were all publicly available well before the evaluated models were released, so their test sets may appear in pretraining data. The paper does not discuss contamination."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The Limitations section acknowledges inference overhead but does not quantify it. No API costs, tokens consumed, or wall-clock time reported."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "Hardware is specified (8 A800 GPUs) but no total GPU hours, training time, or API costs are reported."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CoT-based Synthesizer achieves 11.8% improvement for Llama3-8B and 10.3% for GPT-4o on MATH500",
    286       "evidence": "Table 1: Llama3-8B goes from 24.2 (CoT-prompting) to 36.0 (Synthesizer-8B); GPT-4o from 62.5 to 72.8.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "The method can synthesize correct answers even when all candidate responses are incorrect",
    291       "evidence": "Table 3 shows Synthesizer-8B produces 9 correct answers when correct count=0, while SC/ArmoRM/Scalar RM produce 0. Figure 5 provides a qualitative example.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Synthesizer-8B generalizes to unseen datasets (GSM8k, FeTaQA not in training data)",
    296       "evidence": "Table 1 shows improvements on GSM8k (+3.4 avg) and FeTaQA (+3.2 avg) despite not being in training data.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "The method consistently outperforms baselines across all benchmarks",
    301       "evidence": "Table 1 shows the method achieves highest or second-highest average across all 4 benchmarks, but does not always win per individual model.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Performance improves in a log-linear relationship with training data size",
    306       "evidence": "Figure 3 shows the trend on MATH and GSM8k, but only visual evidence from a figure with no formal fit statistics.",
    307       "supported": "weak"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "CoT-based Synthesizer is a novel inference scaling strategy that synthesizes answers from multiple LLM candidate responses using chain-of-thought reasoning, rather than selecting from candidates. A small 8B-parameter synthesizer trained on automatically generated data improves performance of larger models including GPT-4o across math reasoning and table QA benchmarks. The method uniquely produces correct answers even when all candidates are wrong (9/500 cases on MATH500 with Llama3-8B). Performance scales log-linearly with training data and consistently with number of inference candidates, unlike reward-model-based methods that degrade at high candidate counts.",
    312   "red_flags": [
    313     {
    314       "flag": "No variance despite multiple runs",
    315       "detail": "Each experiment is averaged over 3 runs but no standard deviation or confidence intervals are reported, making it impossible to assess whether differences between methods are statistically meaningful."
    316     },
    317     {
    318       "flag": "No contamination analysis",
    319       "detail": "All benchmarks (GSM8k 2021, MATH 2021, WikiTQ 2015, FeTaQA 2022) predate the models used. No discussion of whether models saw test data during pretraining."
    320     },
    321     {
    322       "flag": "Potential conflict of interest without a competing-interests statement",
    323       "detail": "One author interned at Zhipu AI, which produces GLM-4-Plus (one of the evaluated models). No competing interests statement is provided."
    324     },
    325     {
    326       "flag": "Overbroad generalization claims",
    327       "detail": "Title and abstract claim general 'LLM Performance' enhancement but results are limited to math reasoning and table QA. No testing on code generation, open-ended QA, summarization, or other tasks."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    333       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    334       "year": 2022,
    335       "relevance": "Foundational work on chain-of-thought prompting that this paper builds upon for its synthesis strategy."
    336     },
    337     {
    338       "title": "Self-consistency improves chain of thought reasoning in language models",
    339       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"],
    340       "year": 2022,
    341       "arxiv_id": "2203.11171",
    342       "relevance": "Key baseline method for inference scaling via majority voting on multiple LLM outputs."
    343     },
    344     {
    345       "title": "Universal self-consistency for large language model generation",
    346       "authors": ["Xinyun Chen"],
    347       "year": 2023,
    348       "arxiv_id": "2311.17311",
    349       "relevance": "Extends self-consistency to open-ended tasks using LLM-based voting; direct baseline in this paper."
    350     },
    351     {
    352       "title": "Small language models improve giants by rewriting their outputs",
    353       "authors": ["Giorgos Vernikos"],
    354       "year": 2023,
    355       "arxiv_id": "2305.13514",
    356       "relevance": "LMCOR synthesis baseline; demonstrates small models can improve larger model outputs."
    357     },
    358     {
    359       "title": "LLM-blender: Ensembling large language models with pairwise ranking and generative fusion",
    360       "authors": ["Dongfu Jiang", "Xiang Ren", "Bill Yuchen Lin"],
    361       "year": 2023,
    362       "arxiv_id": "2306.02561",
    363       "relevance": "Related ensemble method for combining LLM outputs via ranking and fusion."
    364     },
    365     {
    366       "title": "Training verifiers to solve math word problems",
    367       "authors": ["Karl Cobbe"],
    368       "year": 2021,
    369       "arxiv_id": "2110.14168",
    370       "relevance": "Introduces GSM8k benchmark and outcome-based verification for math reasoning; baseline method in this paper."
    371     },
    372     {
    373       "title": "An empirical analysis of compute-optimal inference for problem-solving with language models",
    374       "authors": ["Yangzhen Wu"],
    375       "year": 2024,
    376       "arxiv_id": "2408.00724",
    377       "relevance": "Studies compute-optimal inference scaling, directly relevant to the paper's inference scaling approach."
    378     },
    379     {
    380       "title": "Large language monkeys: Scaling inference compute with repeated sampling",
    381       "authors": ["Bradley Brown"],
    382       "year": 2024,
    383       "arxiv_id": "2407.21787",
    384       "relevance": "Studies scaling inference compute via repeated sampling, the paradigm this paper extends."
    385     },
    386     {
    387       "title": "Interpretable preferences via multi-objective reward modeling and mixture-of-experts",
    388       "authors": ["Haoxiang Wang"],
    389       "year": 2024,
    390       "arxiv_id": "2406.12845",
    391       "relevance": "ArmoRM reward model used as a Best-of-N baseline in this paper."
    392     }
    393   ]
    394 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs