ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (30231B)


      1 {
      2   "paper": {
      3     "title": "Do We Truly Need So Many Samples? Multi-LLM Repeated Sampling Efficiently Scales Test-Time Compute",
      4     "authors": [
      5       "Jianhao Chen",
      6       "Zishuo Xun",
      7       "Bocheng Zhou",
      8       "Han Qi",
      9       "Hangfan Zhang",
     10       "Qiaosheng Zhang",
     11       "Yang Chen",
     12       "Wei Hu",
     13       "Yuzhong Qu",
     14       "Wanli Ouyang",
     15       "Shuyue Hu"
     16     ],
     17     "year": 2025,
     18     "venue": "arXiv.org",
     19     "arxiv_id": "2504.00762",
     20     "doi": "10.48550/arXiv.2504.00762"
     21   },
     22   "scan_version": 3,
     23   "active_modules": ["experimental_rigor", "data_leakage"],
     24   "methodology_tags": ["benchmark-eval", "theoretical"],
     25   "key_findings": "ModelSwitch, a multi-LLM repeated-sampling-then-voting strategy that dynamically switches between models based on answer consistency, outperforms single-LLM self-consistency and multi-agent debate methods across seven benchmarks while reducing average sampling cost by 34%. On MMLU-Pro, ModelSwitch achieves 63.2% accuracy, surpassing the best single LLM by 10.2 points and all debate-based methods. The paper provides theoretical analysis showing that mixing models with complementary error distributions can yield correct answers even when neither model individually succeeds, with expected sampling reduction of 1/(n×c) where n is the number of models.",
     26   "checklist": {
     27     "artifacts": {
     28       "code_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The paper states 'Code and data are available at https://github.com/JianhaoChen-nju/ModelSwitch' on page 1."
     32       },
     33       "data_released": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The paper states code and data are available at the GitHub link. Additionally, all seven benchmarks used (GSM8K, MATH, MathBench, MGSM, DATE, MMLU-Pro, AIME24) are publicly available datasets."
     37       },
     38       "environment_specified": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Appendix A.1 mentions the compute cluster (Ubuntu 22.04, 8 NVIDIA A100 GPUs, 80GB VRAM, 1600GB memory) but provides no requirements.txt, Dockerfile, or detailed library version specifications needed to recreate the environment."
     42       },
     43       "reproduction_instructions": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README content, commands, or scripts for replicating experiments are described."
     47       }
     48     },
     49     "statistical_methodology": {
     50       "confidence_intervals_or_error_bars": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "All main results are reported as point estimates (e.g., '81% accuracy', '63.2% accuracy'). No confidence intervals or error bars appear in any of the figures or tables for the primary performance claims."
     54       },
     55       "significance_tests": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The paper claims ModelSwitch 'outperforms' baselines based solely on comparing point accuracy numbers (e.g., Figure 4, Figure 5) without any statistical significance tests. Correlation analyses in Section 2 report r and p-values, but no significance tests are applied to the main comparative claims."
     59       },
     60       "effect_sizes_reported": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper reports absolute accuracy differences with baselines (e.g., '10.2-point increase over best single LLM' on MMLU-Pro), percentage cost savings ('34% samples on average'), and efficiency ratios ('14× more efficient'). Baseline values are always provided for context."
     64       },
     65       "sample_size_justified": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No justification is given for dataset sizes or for the choice of 500 randomly selected MMLU-Pro questions. The sampling budget of 16 is used as a convention but not justified."
     69       },
     70       "variance_reported": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. Results appear to be from single experimental runs with stochastic sampling (temperature=1) but no repeated-run statistics."
     74       }
     75     },
     76     "evaluation_design": {
     77       "baselines_included": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Extensive baselines are included: self-consistency for individual LLMs (GPT-4o mini, Gemini 1.5 Flash), stronger single models (GPT-4o, Gemini 1.5 Pro), and multi-agent methods (MAD, AgentVerse, ChatEval, MOA). See Sections 4.2-4.3."
     81       },
     82       "baselines_contemporary": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Baselines include MAD (ICML 2024), AgentVerse (ICLR 2024), ChatEval (ICLR 2024), MOA (2024), and self-consistency (ICLR 2023). These represent recent state-of-the-art multi-agent and multi-LLM approaches."
     86       },
     87       "ablation_study": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Table 4 provides an ablation study of the weighted voting algorithm, removing internal weights, external weights, and both. Section 4.4 ablates the number and order of LLMs."
     91       },
     92       "multiple_metrics": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The paper reports both accuracy (efficacy) and average actual sampling count per query (efficiency), as well as API cost comparisons (Table 3). These capture different dimensions of performance."
     96       },
     97       "human_evaluation": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No human evaluation is included. All evaluation is automated via answer matching on benchmark datasets with known correct answers."
    101       },
    102       "held_out_test_set": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Standard test splits are used for all seven benchmarks (e.g., GSM8K test set of 1,319 questions, MATH 500, MathBench Arith 300). These are established test sets."
    106       },
    107       "per_category_breakdown": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Results are broken down across all seven individual datasets (Figures 4, 5, 6, 7) and across different sampling budgets, providing per-dataset performance rather than only aggregate numbers."
    111       },
    112       "failure_cases_discussed": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "No specific failure case analysis is provided. The paper does not examine individual questions where ModelSwitch fails or analyze error patterns."
    116       },
    117       "negative_results_reported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 4.4 reports that scaling from 2 to 6 models can hurt performance (78.6%→76.4% on DATE under strong-to-weak). The paper notes 'simply increasing the number of models does not continuously enhance performance.'"
    121       }
    122     },
    123     "claims_and_evidence": {
    124       "abstract_claims_supported": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The abstract claims are supported: outperforms self-consistency (Figure 4), outperforms multi-agent debate (Figure 5), reduces inference costs (Table 3), requires only few comparable LLMs (Figure 6), can be extended with verification (Figure 7), and theoretical analysis is provided (Section 5)."
    128       },
    129       "causal_claims_justified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper's main causal claims ('ModelSwitch improves performance') are supported by controlled ablation studies (Table 4 removes individual components) and Section 4.4 systematically varies the number and order of models. Theoretical analysis in Section 5 provides formal conditions. The ablation design adequately isolates component contributions."
    133       },
    134       "generalization_bounded": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper claims applicability to 'various reasoning and knowledge-based tasks' (Conclusion) and the title implies broad test-time compute scaling. However, 5 of 7 benchmarks are math-focused (GSM8K, MATH, MathBench, MGSM, AIME24) with only DATE (symbolic reasoning) and MMLU-Pro (multitask) outside mathematics. The paper does not bound its generalization claims to the predominantly mathematical settings it actually tests."
    138       },
    139       "alternative_explanations_discussed": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No alternative explanations are discussed. The paper does not consider confounds such as whether the improvements are specific to the particular model pairs chosen, whether the benchmarks have properties that particularly favor multi-model voting, or whether the consistency signal is merely a proxy for question difficulty."
    143       },
    144       "proxy_outcome_distinction": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The paper's claims match its measurements: it measures accuracy on benchmarks and frames it as accuracy; it measures sampling counts and frames it as efficiency. No proxy gap exists between what is measured and what is claimed."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Models are identified by marketing names: 'GPT-4o mini', 'Gemini 1.5 Flash', 'Claude 3 Haiku', without snapshot dates or API versions. Open-source models are specified more precisely (e.g., 'Llama-3.1-8B-Instruct', 'Qwen2.5-7B-Instruct') but closed-source models lack version specificity."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "The paper states 'All our queries are asked in COT format by default' (Section 4.2) but does not provide the actual prompt text or chain-of-thought prompt templates used."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "Only GPT-4o mini's hyperparameters are specified: 'temperature and top_p of GPT-4o mini to 1' (Appendix A.1). All other LLMs 'were kept with their default hyperparameters' without stating what those defaults are."
    165       },
    166       "scaffolding_described": {
    167         "applies": false,
    168         "answer": false,
    169         "justification": "No agentic scaffolding is used. The method is a sampling and voting algorithm without agent loops, tool use, or scaffolding."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Appendix A.1 describes each dataset with sample counts and subsets used (e.g., MathBench Arith subset with 300 questions, MGSM 10 non-English languages with 1,000 samples, MMLU-Pro random 500-question subset)."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No dedicated limitations or threats-to-validity section exists in the paper. The conclusion mentions the approach is 'practical and generalizable' without discussing any limitations."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No specific threats to validity are discussed anywhere in the paper."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No explicit scope boundaries are stated. The paper does not describe what the results do NOT show, what settings or tasks are excluded, or what claims the authors are not making."
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The paper states 'Code and data are available at https://github.com/JianhaoChen-nju/ModelSwitch.' All benchmarks used are also publicly available."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Data collection is straightforward: standard benchmarks are used. Appendix A.1 describes each benchmark, its size, and how subsets were selected (e.g., random 500 from MMLU-Pro, 10 non-English languages from MGSM)."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No human participants. Data consists of standard public benchmarks."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Algorithm 1 describes the full ModelSwitch pipeline. The experimental setup (Section 4.1) describes how queries are processed, how answers are extracted, and how voting is performed."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No funding or acknowledgments section is present in the paper. Authors are from multiple institutions (Nanjing University, Shanghai AI Lab, University of Auckland, Penn State) but no funding sources are disclosed."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "All author affiliations are clearly listed on the first page. Authors are from academic institutions and Shanghai AI Laboratory. They evaluate third-party models (GPT-4o, Gemini, Claude) rather than their own products."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No funding is disclosed, making it impossible to assess funder independence. Shanghai AI Laboratory affiliations could represent institutional interest in showing multi-LLM approaches work."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests or financial interests statement is included in the paper."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No training data cutoff dates are stated for any of the models used (GPT-4o mini, Gemini 1.5 Flash, Claude 3 Haiku, or any open-source models)."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No discussion of potential train/test overlap. GSM8K, MATH, and other benchmarks have been publicly available for years and could be in the training data of models used."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "Several benchmarks (GSM8K published 2021, MATH 2021) were available before the training of models like GPT-4o mini and Gemini 1.5 Flash. No contamination analysis or discussion is provided."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study. All experiments are automated benchmark evaluations."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants in this study."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Table 3 reports actual API costs in dollars for each dataset and method. The paper also reports cost reduction percentages (15%-48% across datasets) and average sampling count reductions (34% on average)."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Appendix A.1 describes compute resources (Ubuntu 22.04, 8 NVIDIA A100 GPUs, 80GB VRAM each, 1600GB memory). Table 3 provides total API costs. Minimum hardware requirements for different model sizes are also stated."
    302       }
    303     },
    304     "experimental_rigor": {
    305       "seed_sensitivity_reported": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results are reported as single runs with stochastic sampling (temperature=1) but no repeated-seed evaluation."
    309       },
    310       "number_of_runs_stated": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "The number of experimental runs is not explicitly stated. The sampling budget K is stated per query, but whether the entire experiment was repeated multiple times is not specified."
    314       },
    315       "hyperparameter_search_budget": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "External weights Wβ (Tables 1-2) appear tuned per dataset but no search budget, search method, or number of configurations tried is reported."
    319       },
    320       "best_config_selection_justified": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The Wβ values in Tables 1 and 2 are presented without justification for how they were selected. No validation set selection procedure or search methodology is described."
    324       },
    325       "multiple_comparison_correction": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The paper makes many comparisons across 7 datasets and 6+ methods without any correction for multiple comparisons or even basic significance testing."
    329       },
    330       "self_comparison_bias_addressed": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The authors implement ModelSwitch and compare against other methods without acknowledging the bias of evaluating their own system. No independent evaluation or discussion of this bias."
    334       },
    335       "compute_budget_vs_performance": {
    336         "applies": true,
    337         "answer": true,
    338         "justification": "Figures 4 and 7 explicitly show performance as a function of sampling budget (1 to 16). Table 3 compares costs at matched budgets. The comparison is always at matched total sampling budget."
    339       },
    340       "benchmark_construct_validity": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether the benchmarks actually measure the claimed capabilities. The paper uses GSM8K, MATH, etc. without questioning whether accuracy on these benchmarks reflects the broader 'reasoning and knowledge-based' capabilities claimed."
    344       },
    345       "scaffold_confound_addressed": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "No scaffolding is involved. The method is a sampling and voting strategy applied directly to model outputs."
    349       }
    350     },
    351     "data_leakage": {
    352       "temporal_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "Not addressed. Several benchmarks (GSM8K, MATH) were published years before the models' training periods, creating potential temporal leakage."
    356       },
    357       "feature_leakage_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "Not addressed. The evaluation uses direct question-answering so feature leakage risk is lower, but the paper does not discuss this."
    361       },
    362       "non_independence_addressed": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "Not addressed. No discussion of whether training data for the evaluated models contains the benchmark questions or structurally similar problems."
    366       },
    367       "leakage_detection_method": {
    368         "applies": true,
    369         "answer": false,
    370         "justification": "No leakage detection or prevention methods are used. No canary strings, membership inference tests, or decontamination pipelines."
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "ModelSwitch outperforms single-LLM self-consistency across all seven benchmarks while reducing average sampling count by 34%.",
    377       "evidence": "Figure 4 shows ModelSwitch consistently outperforming self-consistency for both GPT-4o mini and Gemini 1.5 Flash across 6 datasets. Average actual sampling counts are 9.2-13.4 out of budget 16, saving 34% on average (Section 4.2).",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "ModelSwitch achieves 63.2% accuracy on MMLU-Pro, surpassing the best single LLM by 10.2 points and all multi-agent debate methods.",
    382       "evidence": "Figure 5 shows ModelSwitch at 63.2% vs. best single LLM at 53%, MAD at 47.6%, AgentVerse at 44.8%, ChatEval at 44%, MOA at 52.6% (Section 4.3).",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Only a few comparable LLMs are needed to achieve optimal ModelSwitch performance; adding more models does not continuously improve results.",
    387       "evidence": "Figure 6 shows that scaling from 1→2 models provides the biggest gain, while 2→6 models shows plateau or decline (e.g., 78.6%→76.4% on DATE). Section 4.4.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "There is a universal correlation between consistency (entropy) and accuracy across LLMs and datasets.",
    392       "evidence": "Figure 2 shows moderate to high correlations (r = 0.61 to 0.96, all p < 0.001) between entropy and accuracy for 6 LLMs on MATH and MathBench with 16 samples per query (Section 2).",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "ModelSwitch can be combined with reward models (RM-BoN) for further performance improvement.",
    397       "evidence": "Figure 7 shows ModelSwitch + RM-BoN achieving 84% on both MATH and MathBench with 16 samples, vs. 82% and 81.33% for best single LLM + RM-BoN (Section 4.5).",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "The expected number of samplings can be reduced by a factor 1/(n×c) where n is the number of models and c is the consistent-answer probability.",
    402       "evidence": "Theoretical bound derived in Section 5, Equation 3. The bound holds under the assumption that each model's probability of producing a consistent answer list exceeds a constant c > 0.",
    403       "supported": "strong"
    404     }
    405   ],
    406   "red_flags": [
    407     {
    408       "flag": "No error bars or uncertainty quantification",
    409       "detail": "All main results (Figures 4, 5, 6, 7) report single point estimates with no confidence intervals, error bars, or variance across runs. Given stochastic sampling (temperature=1), results could vary substantially across repetitions."
    410     },
    411     {
    412       "flag": "Heavy math benchmark bias",
    413       "detail": "5 of 7 benchmarks (GSM8K, MATH, MathBench, MGSM, AIME24) are mathematics-focused. Only DATE (symbolic reasoning) and MMLU-Pro (multitask) test non-mathematical domains. Claims of generality to 'various reasoning and knowledge-based tasks' are not well-supported."
    414     },
    415     {
    416       "flag": "Unexplained hyperparameter tuning",
    417       "detail": "External weights Wβ (Tables 1-2) vary across datasets and models (values of 1, 1.5, or 2) with no explanation of how they were selected, whether a validation set was used, or how sensitive results are to these choices."
    418     },
    419     {
    420       "flag": "No contamination analysis",
    421       "detail": "Several benchmarks (GSM8K, MATH published 2021) have been publicly available for years and may be in the training data of GPT-4o mini, Gemini 1.5 Flash, and other models. No contamination analysis is performed."
    422     },
    423     {
    424       "flag": "Missing limitations section",
    425       "detail": "The paper has no limitations, threats-to-validity, or scope boundaries section. No discussion of when or why ModelSwitch might fail, or what settings it is not applicable to."
    426     },
    427     {
    428       "flag": "Headline MMLU-Pro gain lacks statistical support",
    429       "detail": "The MMLU-Pro result (63.2% vs. 53% best single LLM) is a 10.2pp improvement but reported without statistical tests or repeated runs. With stochastic sampling and only 500 randomly-selected questions, the true difference could be smaller."
    430     }
    431   ],
    432   "cited_papers": [
    433     {
    434       "title": "Scaling LLM Test-Time Compute Optimally Can Be More Effective Than Scaling Model Parameters",
    435       "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"],
    436       "year": 2024,
    437       "arxiv_id": "2408.03314",
    438       "relevance": "Core reference on test-time compute scaling as an alternative to parameter scaling, which ModelSwitch builds upon."
    439     },
    440     {
    441       "title": "Large Language Monkeys: Scaling Inference Compute with Repeated Sampling",
    442       "authors": ["Bradley Brown", "Jordan Juravsky", "Ryan Ehrlich"],
    443       "year": 2024,
    444       "arxiv_id": "2407.21787",
    445       "relevance": "Demonstrates that scaling repeated sampling consistently improves coverage of correct answers, motivating the sample efficiency problem this paper addresses."
    446     },
    447     {
    448       "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
    449       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"],
    450       "year": 2023,
    451       "relevance": "The foundational self-consistency method that ModelSwitch extends to the multi-LLM setting."
    452     },
    453     {
    454       "title": "Improving Factuality and Reasoning in Language Models through Multiagent Debate",
    455       "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba"],
    456       "year": 2024,
    457       "relevance": "Multi-agent debate baseline (MAD) that ModelSwitch outperforms on multiple benchmarks."
    458     },
    459     {
    460       "title": "AgentVerse: Facilitating Multi-Agent Collaboration and Exploring Emergent Behaviors",
    461       "authors": ["Weize Chen", "Yusheng Su", "Jingwei Zuo"],
    462       "year": 2024,
    463       "relevance": "Multi-agent collaboration framework used as a baseline, outperformed by ModelSwitch."
    464     },
    465     {
    466       "title": "ChatEval: Towards Better LLM-Based Evaluators through Multi-Agent Debate",
    467       "authors": ["Chi-Min Chan", "Weize Chen", "Yusheng Su"],
    468       "year": 2024,
    469       "relevance": "Multi-agent debate evaluation system used as a baseline in the multi-agent comparison."
    470     },
    471     {
    472       "title": "Mixture-of-Agents Enhances Large Language Model Capabilities",
    473       "authors": ["Junlin Wang", "Jue Wang", "Ben Athiwaratkun"],
    474       "year": 2024,
    475       "arxiv_id": "2406.04692",
    476       "relevance": "MOA architecture that aggregates samples from multiple LLMs, serving as the strongest multi-LLM baseline."
    477     },
    478     {
    479       "title": "If Multi-Agent Debate is the Answer, What is the Question?",
    480       "authors": ["Hangfan Zhang", "Zhiyao Cui", "Xinrun Wang"],
    481       "year": 2025,
    482       "arxiv_id": "2502.08788",
    483       "relevance": "Questions when multi-agent debate helps, finding that different LLMs excel at different questions—supporting ModelSwitch's complementary strengths hypothesis."
    484     },
    485     {
    486       "title": "Can 1B LLM Surpass 405B LLM? Rethinking Compute-Optimal Test-Time Scaling",
    487       "authors": ["Runze Liu", "Junqi Gao", "Jian Zhao"],
    488       "year": 2025,
    489       "arxiv_id": "2502.06703",
    490       "relevance": "Explores test-time compute scaling to bridge model size gaps, directly related to this paper's claim of achieving large-model performance with smaller models."
    491     },
    492     {
    493       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    494       "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"],
    495       "year": 2025,
    496       "arxiv_id": "2501.12948",
    497       "relevance": "Reasoning-focused LLM used in the extended experiments with reasoning models (Appendix A.2)."
    498     },
    499     {
    500       "title": "RouteLLM: Learning to Route LLMs from Preference Data",
    501       "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu"],
    502       "year": 2024,
    503       "relevance": "Model routing approach that trains routers to distribute queries to specialized models, conceptually related to ModelSwitch's training-free routing via consistency."
    504     },
    505     {
    506       "title": "More Agents Is All You Need",
    507       "authors": ["Junyou Li", "Qin Zhang", "Yangbin Yu"],
    508       "year": 2024,
    509       "arxiv_id": "2402.05120",
    510       "relevance": "Demonstrates that scaling the number of sampling agents improves performance, directly related to the repeated sampling paradigm."
    511     }
    512   ],
    513   "engagement_factors": {
    514     "practical_relevance": {
    515       "score": 2,
    516       "justification": "The method is simple to implement (multi-API sampling + voting) and reduces API costs by 34%, making it immediately applicable to practitioners using multiple LLM providers."
    517     },
    518     "surprise_contrarian": {
    519       "score": 1,
    520       "justification": "The finding that two weak models can outperform strong models via simple mixing is mildly surprising but the complementary strengths intuition is not deeply contrarian."
    521     },
    522     "fear_safety": {
    523       "score": 0,
    524       "justification": "No safety, security, or risk implications discussed or relevant."
    525     },
    526     "drama_conflict": {
    527       "score": 0,
    528       "justification": "No controversy or conflict. Standard benchmark comparison paper."
    529     },
    530     "demo_ability": {
    531       "score": 2,
    532       "justification": "Code is released on GitHub. The method is simple enough to implement independently, though it requires multiple API accounts."
    533     },
    534     "brand_recognition": {
    535       "score": 1,
    536       "justification": "Shanghai AI Laboratory has moderate recognition. The paper uses well-known models (GPT-4o, Gemini, Claude) but is not from those labs."
    537     }
    538   }
    539 }

Impressum · Datenschutz