ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24551B)


      1 {
      2   "paper": {
      3     "title": "Towards Reasoning in Large Language Models via Multi-Agent Peer Review Collaboration",
      4     "authors": ["Zhenran Xu", "Senbao Shi", "Baotian Hu", "Jindi Yu", "Dongfang Li", "Min Zhang", "Yuxiang Wu"],
      5     "year": 2023,
      6     "venue": "arXiv",
      7     "arxiv_id": "2311.08152",
      8     "doi": "10.48550/arXiv.2311.08152"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "A multi-agent peer review collaboration strategy outperforms single-agent and multi-agent debate baselines across all 10 reasoning benchmarks (math, commonsense, symbolic). Integrating confidence scores in reviews improves math reasoning but not commonsense/symbolic tasks due to LLM overconfidence. Feedback exchange is more effective than solution sharing, and models with small capability gaps but high diversity collaborate best.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "GitHub repository provided in the abstract: https://github.com/HITsz-TMG/Multi-agent-peer-review."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All 10 evaluation datasets are standard public benchmarks (GSM8K, SVAMP, AQuA, MultiArith, AddSub, SingleEq, ARC-c, StrategyQA, Colored Objects, Penguins)."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, requirements.txt, or dependency details provided. Only mentions using OpenAI API."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions in the paper. Implementation details in Appendix A describe settings but not how to run experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results in Tables 2-4 are single point estimates with no confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Claims of superiority ('outperforms all baselines across all datasets') are based solely on comparing accuracy numbers with no statistical tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Percentage improvements with baseline context provided, e.g., '+1.40% on GSM8K, +3.80% on SVAMP, +2.75% on AQuA over the prior state-of-the-art' (Section 4.4). Table 5 also reports absolute accuracy changes (e.g., +7.6%)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper uses a maximum of 500 samples per dataset 'due to the rate limits and a restricted budget' (Appendix A) but does not justify whether this sample size is sufficient for the claims made."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures reported across any experiments. All results appear to be single-run numbers."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Four baselines compared: Zero-shot CoT, Self-correct, Multi-agent Majority, and Multi-agent Debate (Section 4.2)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include contemporary methods: Self-correct (Huang et al., 2023) and Multi-agent Debate (Du et al., 2023), both from 2023."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Two ablations: removing confidence scores (w/o confidence) and removing peer solutions (w/o solution), reported across all 10 datasets in Tables 2-4."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Only accuracy is reported as the evaluation metric across all experiments."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation of reasoning quality or outputs. The paper does manually annotate 600 feedback samples for the confidence analysis (Section 4.5) but this analyzes feedback correctness, not system outputs."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Standard benchmark test sets are used. The paper uses up to 500 randomly selected samples from each dataset's established test/evaluation split."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results broken down across all 10 individual datasets in three categories (math, commonsense, symbolic) in separate tables (Tables 2-4)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Answer change analysis (Figure 3, Section 4.5) shows Correct→Incorrect cases. The confidence analysis in Figure 5 discusses overconfidence as a failure mode. Case study in Appendix E shows the full collaboration process including initial wrong answers."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Several negative results: confidence integration hurts on some non-math datasets (Tables 3-4), performance declines beyond 4 agents (Figure 4a), additional review rounds show no improvement (Figure 4b), stronger model does not benefit from collaboration with weaker model (Table 5, claude-2.1 drops -0.2%)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims of 'superior accuracy across all ten datasets' are supported by Tables 2-4. Claims about confidence effectiveness and feedback superiority are supported by ablation results."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims (peer review 'improves' reasoning) are supported by ablation studies that isolate confidence and solution components. The ablation design with controlled single-variable manipulation is adequate."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title says 'Towards Reasoning in Large Language Models' broadly, but the main results are only on gpt-3.5-turbo with 10 specific benchmarks. No other model family is used as the primary setting, and the cross-model analysis in Table 5 is limited to GPT-3.5 and Claude variants."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No discussion of alternative explanations for the improvements. Could the gains come from majority voting with more samples rather than the peer review mechanism? The paper does not disentangle the effect of additional compute/tokens from the review process."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures accuracy on reasoning benchmarks and claims improvements in reasoning accuracy — the measurement matches the claim granularity without broader framing."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Specific model versions stated: 'gpt-3.5-turbo-0613', 'gpt-3.5-turbo-16k-0613', 'claude-instant-1.2', 'claude-2.1' (Section 4.3)."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full prompt text for Stage 2 review and Stage 3 revision provided in Section 3. Role prompts listed in Appendix D. The actual instructions concatenated with solutions are quoted verbatim."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Appendix A states 'all parameters in default setting' but does not specify what those defaults are (temperature, top-p, etc.)."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The three-stage pipeline (Create, Review, Revise) is described in detail in Section 3 with full workflow, including how solutions and reviews are concatenated and passed between agents."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "Random sampling of up to 500 examples per dataset is mentioned but no details on how the random selection was performed or what seed was used."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "A dedicated 'Limitations' section discusses token consumption costs and the need for further research on agent collaboration factors."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "Limitations section only mentions generic concerns about token costs and that 'underlying factors for LLM collaboration are more than just capability and answer diversity.' No specific threats to validity of the reported results."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No explicit statement of what the results do NOT show. Does not acknowledge that results are limited to gpt-3.5-turbo, to reasoning tasks, or to the specific benchmark subsets used."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental outputs (model responses, reviews, revised answers) are available for verification. Only aggregate accuracy numbers reported."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Data sources are well described — all 10 benchmarks are cited with statistics in Table 1 (domain, sample count, average words, answer type)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. All data comes from standard public benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No documentation of how the 500-sample subsets were selected, or the pipeline from raw benchmark data to final accuracy numbers."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information or acknowledgments section found in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations clearly listed: Harbin Institute of Technology (Shenzhen) and University College London. No conflict with evaluated products (OpenAI GPT, Anthropic Claude)."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding information disclosed, so independence cannot be assessed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement found in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No mention of the training data cutoff for gpt-3.5-turbo-0613 or the Claude models used."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether benchmark examples (GSM8K published 2021, ARC 2018, etc.) appeared in the training data of gpt-3.5-turbo."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "Many benchmarks (GSM8K 2021, SVAMP 2021, ARC 2018) were published well before the model's training cutoff. No discussion of contamination risk."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "The Limitations section mentions token consumption as a concern but does not quantify API costs, tokens consumed, or latency."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total compute budget stated. The paper mentions 'rate limits and a restricted budget' constraining sample sizes but does not quantify."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of random seeds or sensitivity analysis. Results appear to be from single runs."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "Number of experimental runs not stated. No indication of whether results are from single or multiple runs."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Fixed 3 agents, 1 review round as defaults. No search budget reported for how these were selected."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The default configuration of 3 agents and 1 round is not justified. Figure 4a shows 4 agents is optimal but main results use 3. No explanation for this discrepancy."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No statistical tests performed at all, let alone correction for multiple comparisons across 10 datasets."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "Authors implement their own baselines (except Multi-agent Debate which uses official code) without acknowledging potential bias from re-implementation."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The proposed method uses substantially more API calls than single-agent baselines (3 agents, each running the create, review, and revise stages) but performance is not compared at matched compute budgets."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether the 10 benchmarks actually measure 'reasoning' as claimed, or what aspects of reasoning they capture."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The multi-agent scaffolding IS the system, but comparisons with Multi-agent Debate use different scaffolds (different prompts, different interaction patterns) without isolating the scaffold effect from the method effect."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of temporal leakage. Benchmarks like GSM8K (2021) and ARC (2018) were published well before the model's training cutoff."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup leaks information through peer solutions during the review process that would not be available in single-agent deployment."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of independence between training and test data for any of the 10 benchmarks."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Multi-agent peer review collaboration outperforms all single-agent and multi-agent baselines across all 10 reasoning datasets.",
    365       "evidence": "Tables 2-4 show accuracy improvements on all 10 datasets: +1.40% GSM8K, +3.80% SVAMP, +2.75% AQuA over prior SOTA (Section 4.4).",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "Integrating confidence scores in reviews improves performance on mathematical reasoning tasks.",
    370       "evidence": "Table 2 shows consistent improvement with confidence on all 6 math datasets. Tables 3-4 show mixed/negative results on non-math tasks (Section 4.4 ablation).",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Feedback exchange is more effective than solution sharing in multi-agent collaboration.",
    375       "evidence": "'Ours (w/o solution)' outperforms Multi-agent Debate in 8/10 datasets (Section 4.4). Multi-agent Debate only shares solutions while w/o solution only shares feedback.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "LLMs with smaller capability gaps but higher diversity collaborate better.",
    380       "evidence": "Table 5 shows gpt-3.5-turbo variants (gap 2.8, diversity 35.4) improve mutually, while claude models (gap 2.2, diversity 15.8) show the stronger model declining (-0.2%). Section 4.5 analysis.",
    381       "supported": "weak"
    382     },
    383     {
    384       "claim": "LLMs exhibit significant overconfidence and miscalibration in verbalized confidence scores.",
    385       "evidence": "Figure 5 reliability diagrams show accuracy within each confidence bin is much lower than the confidence level, especially on non-math datasets. AUROC and ECE metrics reported for 600 manually annotated samples each from GSM8K and Penguins.",
    386       "supported": "moderate"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "No error bars or statistical tests",
    392       "detail": "All comparisons across 10 datasets are made by comparing single point estimates. Claims of superiority are not statistically validated. Some differences are very small (e.g., +0.20% on AddSub, +0.40% on MultiArith)."
    393     },
    394     {
    395       "flag": "Uncontrolled compute budget",
    396       "detail": "The peer review method uses substantially more API calls than baselines (3 agents × 3 stages vs. single agent). Multi-agent Majority uses the same number of initial samples but without the review overhead. The comparison does not control for total tokens consumed."
    397     },
    398     {
    399       "flag": "Subsample without justification",
    400       "detail": "Experiments use max 500 randomly selected samples per dataset due to budget constraints. No analysis of whether this subsample is representative or sufficient for reliable comparison."
    401     },
    402     {
    403       "flag": "Benchmark contamination risk ignored",
    404       "detail": "Multiple benchmarks (ARC 2018, GSM8K 2021, SVAMP 2021) were published well before the model's training cutoff. Absolute performance levels may be inflated by contamination, though relative comparisons may still hold."
    405     },
    406     {
    407       "flag": "Configuration inconsistency",
    408       "detail": "Figure 4a shows 4 agents gives best performance, but all main experiments use 3 agents. The paper uses gpt-3.5-turbo-16k for 5-agent and multi-round experiments, changing the model mid-analysis."
    409     }
    410   ],
    411   "cited_papers": [
    412     {
    413       "title": "Large language models cannot self-correct reasoning yet",
    414       "authors": ["Jie Huang", "Xinyun Chen", "Swaroop Mishra", "Huaixiu Steven Zheng", "Adams Wei Yu", "Xinying Song", "Denny Zhou"],
    415       "year": 2023,
    416       "relevance": "Key motivation paper showing LLMs cannot self-correct without external feedback, directly motivating the multi-agent approach."
    417     },
    418     {
    419       "title": "Improving factuality and reasoning in language models through multiagent debate",
    420       "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba", "Joshua B. Tenenbaum", "Igor Mordatch"],
    421       "year": 2023,
    422       "relevance": "Primary multi-agent baseline; explores debate-based multi-agent collaboration for reasoning."
    423     },
    424     {
    425       "title": "Self-refine: Iterative refinement with self-feedback",
    426       "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"],
    427       "year": 2023,
    428       "relevance": "Influential self-correction method that this paper argues is limited compared to multi-agent feedback."
    429     },
    430     {
    431       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    432       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    433       "year": 2022,
    434       "relevance": "Foundational prompting technique used as the base reasoning method in all experiments."
    435     },
    436     {
    437       "title": "Self-consistency improves chain of thought reasoning in language models",
    438       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans", "Quoc V Le"],
    439       "year": 2023,
    440       "relevance": "Majority voting baseline; key single-agent reasoning enhancement technique."
    441     },
    442     {
    443       "title": "CAMEL: Communicative agents for mind exploration of large language model society",
    444       "authors": ["Guohao Li"],
    445       "year": 2023,
    446       "relevance": "Cooperative agent framework for role-playing collaboration, related multi-agent work."
    447     },
    448     {
    449       "title": "Communicative agents for software development",
    450       "authors": ["Chen Qian", "Xin Cong", "Wei Liu"],
    451       "year": 2023,
    452       "relevance": "ChatDev: multi-agent collaboration applied to software development, extending role-playing paradigm."
    453     },
    454     {
    455       "title": "Corex: Pushing the boundaries of complex reasoning through multi-model collaboration",
    456       "authors": ["Qiushi Sun", "Zhangyue Yin", "Xiang Li"],
    457       "year": 2023,
    458       "relevance": "Explores three collaboration paradigms (debate, code review, retrieve) for reasoning — directly comparable approach."
    459     },
    460     {
    461       "title": "Encouraging divergent thinking in large language models through multi-agent debate",
    462       "authors": ["Tian Liang", "Zhiwei He", "Wenxiang Jiao"],
    463       "year": 2023,
    464       "relevance": "Shows LLMs cannot generate novel thoughts through self-reflection, motivating multi-agent approaches."
    465     },
    466     {
    467       "title": "Can llms express their uncertainty? An empirical evaluation of confidence elicitation in llms",
    468       "authors": ["Miao Xiong", "Zhiyuan Hu", "Xinyang Lu"],
    469       "year": 2023,
    470       "relevance": "Directly relevant to the confidence calibration analysis — shows LLM overconfidence and miscalibration."
    471     },
    472     {
    473       "title": "Reflexion: Language agents with verbal reinforcement learning",
    474       "authors": ["Noah Shinn", "Federico Cassano", "Edward Berman"],
    475       "year": 2023,
    476       "relevance": "Self-reflection agent approach for iterative improvement, related self-correction baseline."
    477     }
    478   ]
    479 }

Impressum · Datenschutz