scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (33625B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Do We Truly Need So Many Samples? Multi-LLM Repeated Sampling Efficiently Scales Test-Time Compute",
      6     "authors": [
      7       "Jianhao Chen",
      8       "Zishuo Xun",
      9       "Bocheng Zhou",
     10       "Han Qi",
     11       "Hangfan Zhang",
     12       "Qiaosheng Zhang",
     13       "Yang Chen",
     14       "Wei Hu",
     15       "Yuzhong Qu",
     16       "Wanli Ouyang",
     17       "Shuyue Hu"
     18     ],
     19     "year": 2025,
     20     "venue": "arXiv.org",
     21     "arxiv_id": "2504.00762",
     22     "doi": "10.48550/arXiv.2504.00762"
     23   },
     24   "checklist": {
     25     "claims_and_evidence": {
     26       "abstract_claims_supported": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The abstract claims are supported: outperforms self-consistency (Figure 4), outperforms multi-agent debate (Figure 5), reduces inference costs (Table 3), requires only few comparable LLMs (Figure 6), can be extended with verification (Figure 7), and theoretical analysis is provided (Section 5).",
     30         "source": "opus"
     31       },
     32       "causal_claims_justified": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The paper's main causal claims ('ModelSwitch improves performance') are supported by controlled ablation studies (Table 4 removes individual components) and Section 4.4 systematically varies the number and order of models. Theoretical analysis in Section 5 provides formal conditions. The ablation design adequately isolates component contributions.",
     36         "source": "opus"
     37       },
     38       "generalization_bounded": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims applicability to 'various reasoning and knowledge-based tasks' (Conclusion) and the title implies broad test-time compute scaling. However, 5 of 7 benchmarks are math-focused (GSM8K, MATH, MathBench, MGSM, AIME24) with only DATE (symbolic reasoning) and MMLU-Pro (multitask) outside mathematics. The generalization to non-mathematical domains is not bounded.",
     42         "source": "opus"
     43       },
     44       "alternative_explanations_discussed": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "No alternative explanations are discussed. The paper does not consider confounds such as whether the improvements are specific to the particular model pairs chosen, whether the benchmarks have properties that particularly favor multi-model voting, or whether the consistency signal is merely a proxy for question difficulty.",
     48         "source": "opus"
     49       },
     50       "proxy_outcome_distinction": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "The paper's claims match its measurements: it measures accuracy on benchmarks and frames it as accuracy; it measures sampling counts and frames it as efficiency. No proxy gap exists between what is measured and what is claimed.",
     54         "source": "opus"
     55       }
     56     },
     57     "limitations_and_scope": {
     58       "limitations_section_present": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No dedicated limitations or threats-to-validity section exists in the paper. The conclusion mentions the approach is 'practical and generalizable' without discussing any limitations.",
     62         "source": "opus"
     63       },
     64       "threats_to_validity_specific": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No specific threats to validity are discussed anywhere in the paper.",
     68         "source": "opus"
     69       },
     70       "scope_boundaries_stated": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No explicit scope boundaries are stated. The paper does not describe what the results do NOT show, what settings or tasks are excluded, or what claims the authors are not making.",
     74         "source": "opus"
     75       }
     76     },
     77     "conflicts_of_interest": {
     78       "funding_disclosed": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "No funding or acknowledgments section is present in the paper. Authors are from multiple institutions (Nanjing University, Shanghai AI Lab, University of Auckland, Penn State) but no funding sources are disclosed.",
     82         "source": "opus"
     83       },
     84       "affiliations_disclosed": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "All author affiliations are clearly listed on the first page. Authors are from academic institutions and Shanghai AI Laboratory. They evaluate third-party models (GPT-4o, Gemini, Claude) rather than their own products.",
     88         "source": "opus"
     89       },
     90       "funder_independent_of_outcome": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No funding is disclosed, making it impossible to assess funder independence. Shanghai AI Laboratory affiliations could represent institutional interest in showing multi-LLM approaches work.",
     94         "source": "opus"
     95       },
     96       "financial_interests_declared": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "No competing interests or financial interests statement is included in the paper.",
    100         "source": "opus"
    101       }
    102     },
    103     "scope_and_framing": {
    104       "key_terms_defined": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Key terms are defined: 'consistency' is formally defined as entropy of generated answers (Sec. 2), 'ModelSwitch' is specified in Algorithm 1, and the generation-verification paradigm is explained in context.",
    108         "source": "haiku"
    109       },
    110       "intended_contribution_clear": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Four explicit contributions are enumerated in Sec. 1: empirical correlation analysis, the ModelSwitch method, experimental results, and theoretical analysis.",
    114         "source": "haiku"
    115       },
    116       "engagement_with_prior_work": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Sec. 6 explicitly differentiates ModelSwitch from self-consistency, MAD, MOA, AgentVerse, ChatEval, and model routing, explaining mechanistic differences rather than just listing papers.",
    120         "source": "haiku"
    121       }
    122     }
    123   },
    124   "type_checklist": {
    125     "empirical": {
    126       "artifacts": {
    127         "code_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The paper states 'Code and data are available at https://github.com/JianhaoChen-nju/ModelSwitch' on page 1.",
    131           "source": "opus"
    132         },
    133         "data_released": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "The paper states code and data are available at the GitHub link. Additionally, all seven benchmarks used (GSM8K, MATH, MathBench, MGSM, DATE, MMLU-Pro, AIME24) are publicly available datasets.",
    137           "source": "opus"
    138         },
    139         "environment_specified": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Appendix A.1 mentions the compute cluster (Ubuntu 22.04, 8 NVIDIA A100 GPUs, 80GB VRAM, 1600GB memory) but provides no requirements.txt, Dockerfile, or detailed library version specifications needed to recreate the environment.",
    143           "source": "opus"
    144         },
    145         "reproduction_instructions": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README content, commands, or scripts for replicating experiments are described.",
    149           "source": "opus"
    150         }
    151       },
    152       "statistical_methodology": {
    153         "confidence_intervals_or_error_bars": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "All main results are reported as point estimates (e.g., '81% accuracy', '63.2% accuracy'). No confidence intervals or error bars appear in any of the figures or tables for the primary performance claims.",
    157           "source": "opus"
    158         },
    159         "significance_tests": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "The paper claims ModelSwitch 'outperforms' baselines based solely on comparing point accuracy numbers (e.g., Figure 4, Figure 5) without any statistical significance tests. Correlation analyses in Section 2 report r and p-values, but no significance tests are applied to the main comparative claims.",
    163           "source": "opus"
    164         },
    165         "effect_sizes_reported": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "The paper reports absolute accuracy differences with baselines (e.g., '10.2-point increase over best single LLM' on MMLU-Pro), percentage sampling savings ('34% samples on average'), and efficiency ratios ('14× more efficient'). Baseline values are always provided for context.",
    169           "source": "opus"
    170         },
    171         "sample_size_justified": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No justification is given for dataset sizes or for the choice of 500 randomly selected MMLU-Pro questions. The sampling budget of 16 is used as a convention but not justified.",
    175           "source": "opus"
    176         },
    177         "variance_reported": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. Results appear to be from single experimental runs with stochastic sampling (temperature=1) but no repeated-run statistics.",
    181           "source": "opus"
    182         }
    183       },
    184       "evaluation_design": {
    185         "baselines_included": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Extensive baselines are included (Sections 4.2–4.3): self-consistency for individual LLMs (GPT-4o mini, Gemini 1.5 Flash), stronger single models (GPT-4o, Gemini 1.5 Pro), and multi-agent methods (MAD, AgentVerse, ChatEval, MOA).",
    189           "source": "opus"
    190         },
    191         "baselines_contemporary": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Baselines include MAD (ICML 2024), AgentVerse (ICLR 2024), ChatEval (ICLR 2024), MOA (2024), and self-consistency (ICLR 2023). These represent recent state-of-the-art multi-agent and multi-LLM approaches.",
    195           "source": "opus"
    196         },
    197         "ablation_study": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Table 4 provides an ablation study of the weighted voting algorithm, removing internal weights, external weights, and both. Section 4.4 ablates the number and order of LLMs.",
    201           "source": "opus"
    202         },
    203         "multiple_metrics": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "The paper reports both accuracy (efficacy) and average actual sampling count per query (efficiency), as well as API cost comparisons (Table 3). These capture different dimensions of performance.",
    207           "source": "opus"
    208         },
    209         "human_evaluation": {
    210           "applies": true,
    211           "answer": false,
    212           "justification": "No human evaluation is included. All evaluation is automated via answer matching on benchmark datasets with known correct answers.",
    213           "source": "opus"
    214         },
    215         "held_out_test_set": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Established test sets, or documented subsets of them, are used for all seven benchmarks (e.g., GSM8K test set of 1,319 questions, MATH 500, MathBench Arith 300, and a random 500-question subset of MMLU-Pro).",
    219           "source": "opus"
    220         },
    221         "per_category_breakdown": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Results are broken down across all seven individual datasets (Figures 4, 5, 6, 7) and across different sampling budgets, providing per-dataset performance rather than only aggregate numbers.",
    225           "source": "opus"
    226         },
    227         "failure_cases_discussed": {
    228           "applies": true,
    229           "answer": false,
    230           "justification": "No specific failure case analysis is provided. The paper does not examine individual questions where ModelSwitch fails or analyze error patterns.",
    231           "source": "opus"
    232         },
    233         "negative_results_reported": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Section 4.4 reports that scaling from 2 to 6 models can hurt performance (78.6%→76.4% on DATE under strong-to-weak). The paper notes 'simply increasing the number of models does not continuously enhance performance.'",
    237           "source": "opus"
    238         }
    239       },
    240       "setup_transparency": {
    241         "model_versions_specified": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "Models are identified by marketing names: 'GPT-4o mini', 'Gemini 1.5 Flash', 'Claude 3 Haiku', without snapshot dates or API versions. Open-source models are specified more precisely (e.g., 'Llama-3.1-8B-Instruct', 'Qwen2.5-7B-Instruct') but closed-source models lack version specificity.",
    245           "source": "opus"
    246         },
    247         "prompts_provided": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "The paper states 'All our queries are asked in COT format by default' (Section 4.2) but does not provide the actual prompt text or chain-of-thought prompt templates used.",
    251           "source": "opus"
    252         },
    253         "hyperparameters_reported": {
    254           "applies": true,
    255           "answer": false,
    256           "justification": "Only GPT-4o mini's hyperparameters are specified: 'temperature and top_p of GPT-4o mini to 1' (Appendix A.1). All other LLMs 'were kept with their default hyperparameters' without stating what those defaults are.",
    257           "source": "opus"
    258         },
    259         "scaffolding_described": {
    260           "applies": false,
    261           "answer": false,
    262           "justification": "No agentic scaffolding is used. The method is a sampling and voting algorithm without agent loops, tool use, or scaffolding.",
    263           "source": "opus"
    264         },
    265         "data_preprocessing_documented": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "Appendix A.1 describes each dataset with sample counts and subsets used (e.g., MathBench Arith subset with 300 questions, MGSM 10 non-English languages with 1,000 samples, MMLU-Pro random 500-question subset).",
    269           "source": "opus"
    270         }
    271       },
    272       "data_integrity": {
    273         "raw_data_available": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "The paper states 'Code and data are available at https://github.com/JianhaoChen-nju/ModelSwitch.' All benchmarks used are also publicly available.",
    277           "source": "opus"
    278         },
    279         "data_collection_described": {
    280           "applies": true,
    281           "answer": true,
    282           "justification": "Data collection is straightforward: standard benchmarks are used. Appendix A.1 describes each benchmark, its size, and how subsets were selected (e.g., random 500 from MMLU-Pro, 10 non-English languages from MGSM).",
    283           "source": "opus"
    284         },
    285         "recruitment_methods_described": {
    286           "applies": false,
    287           "answer": false,
    288           "justification": "No human participants. Data consists of standard public benchmarks.",
    289           "source": "opus"
    290         },
    291         "data_pipeline_documented": {
    292           "applies": true,
    293           "answer": true,
    294           "justification": "Algorithm 1 describes the full ModelSwitch pipeline. The experimental setup (Section 4.1) describes how queries are processed, how answers are extracted, and how voting is performed.",
    295           "source": "opus"
    296         }
    297       },
    298       "contamination": {
    299         "training_cutoff_stated": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No training data cutoff dates are stated for any of the models used (GPT-4o mini, Gemini 1.5 Flash, Claude 3 Haiku, or any open-source models).",
    303           "source": "opus"
    304         },
    305         "train_test_overlap_discussed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "No discussion of potential train/test overlap. GSM8K, MATH, and other benchmarks have been publicly available for years and could be in the training data of models used.",
    309           "source": "opus"
    310         },
    311         "benchmark_contamination_addressed": {
    312           "applies": true,
    313           "answer": false,
    314           "justification": "Several benchmarks (GSM8K published 2021, MATH 2021) were available before the training of models like GPT-4o mini and Gemini 1.5 Flash. No contamination analysis or discussion is provided.",
    315           "source": "opus"
    316         }
    317       },
    318       "human_studies": {
    319         "pre_registered": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study. All experiments are automated benchmark evaluations.",
    323           "source": "opus"
    324         },
    325         "irb_or_ethics_approval": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "opus"
    330         },
    331         "demographics_reported": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "opus"
    336         },
    337         "inclusion_exclusion_criteria": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "opus"
    342         },
    343         "randomization_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "opus"
    348         },
    349         "blinding_described": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "opus"
    354         },
    355         "attrition_reported": {
    356           "applies": false,
    357           "answer": false,
    358           "justification": "No human participants in this study.",
    359           "source": "opus"
    360         }
    361       },
    362       "cost_and_practicality": {
    363         "inference_cost_reported": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Table 3 reports actual API costs in dollars for each dataset and method. The paper also reports cost reduction percentages (15%-48% across datasets) and average sampling count reductions (34% on average).",
    367           "source": "opus"
    368         },
    369         "compute_budget_stated": {
    370           "applies": true,
    371           "answer": true,
    372           "justification": "Appendix A.1 describes compute resources (Ubuntu 22.04, 8 NVIDIA A100 GPUs, 80GB VRAM each, 1600GB memory). Table 3 provides total API costs. Minimum hardware requirements for different model sizes are also stated.",
    373           "source": "opus"
    374         }
    375       },
    376       "experimental_rigor": {
    377         "seed_sensitivity_reported": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to come from single runs with stochastic sampling (temperature=1), and no repeated-seed evaluation is reported.",
    381           "source": "opus"
    382         },
    383         "number_of_runs_stated": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "The number of experimental runs is not explicitly stated. The sampling budget K is stated per query, but whether the entire experiment was repeated multiple times is not specified.",
    387           "source": "opus"
    388         },
    389         "hyperparameter_search_budget": {
    390           "applies": true,
    391           "answer": false,
    392           "justification": "External weights Wβ (Tables 1-2) appear tuned per dataset but no search budget, search method, or number of configurations tried is reported.",
    393           "source": "opus"
    394         },
    395         "best_config_selection_justified": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "The Wβ values in Tables 1 and 2 are presented without justification for how they were selected. No validation set selection procedure or search methodology is described.",
    399           "source": "opus"
    400         },
    401         "multiple_comparison_correction": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The paper makes many comparisons across 7 datasets and 6+ methods without any correction for multiple comparisons or even basic significance testing.",
    405           "source": "opus"
    406         },
    407         "self_comparison_bias_addressed": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "The authors implement ModelSwitch and compare against other methods without acknowledging the bias of evaluating their own system. No independent evaluation or discussion of this bias.",
    411           "source": "opus"
    412         },
    413         "compute_budget_vs_performance": {
    414           "applies": true,
    415           "answer": true,
    416           "justification": "Figures 4 and 7 explicitly show performance as a function of sampling budget (1 to 16). Table 3 compares costs at matched budgets. The comparison is always at matched total sampling budget.",
    417           "source": "opus"
    418         },
    419         "benchmark_construct_validity": {
    420           "applies": true,
    421           "answer": false,
    422           "justification": "No discussion of whether the benchmarks actually measure the claimed capabilities. The paper uses GSM8K, MATH, etc. without questioning whether accuracy on these benchmarks reflects the broader 'reasoning and knowledge-based' capabilities claimed.",
    423           "source": "opus"
    424         },
    425         "scaffold_confound_addressed": {
    426           "applies": false,
    427           "answer": false,
    428           "justification": "No scaffolding is involved. The method is a sampling and voting strategy applied directly to model outputs.",
    429           "source": "opus"
    430         }
    431       },
    432       "data_leakage": {
    433         "temporal_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "Not addressed. Several benchmarks (GSM8K, MATH) were published years before the models' training periods, creating potential temporal leakage.",
    437           "source": "opus"
    438         },
    439         "feature_leakage_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "Not addressed. The evaluation uses direct question-answering so feature leakage risk is lower, but the paper does not discuss this.",
    443           "source": "opus"
    444         },
    445         "non_independence_addressed": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "Not addressed. No discussion of whether training data for the evaluated models contains the benchmark questions or structurally similar problems.",
    449           "source": "opus"
    450         },
    451         "leakage_detection_method": {
    452           "applies": true,
    453           "answer": false,
    454           "justification": "No leakage detection or prevention methods are used. No canary strings, membership inference tests, or decontamination pipelines.",
    455           "source": "opus"
    456         }
    457       }
    458     }
    459   },
    460   "claims": [
    461     {
    462       "claim": "Answer consistency (entropy) is universally correlated with accuracy across multiple LLMs and datasets (r=0.61–0.96, p<0.001)",
    463       "evidence": "Figure 2 shows Pearson correlation coefficients between entropy and accuracy for six LLMs on MATH and MathBench, all statistically significant.",
    464       "supported": "strong"
    465     },
    466     {
    467       "claim": "ModelSwitch outperforms self-consistency of each individual LLM on all six main benchmarks when using GPT-4o mini and Gemini 1.5 Flash",
    468       "evidence": "Figure 4 shows ModelSwitch accuracy curves above both individual LLM self-consistency curves across GSM8K, MATH, MathBench, MGSM, DATE, and MMLU-Pro.",
    469       "supported": "strong"
    470     },
    471     {
    472       "claim": "ModelSwitch achieves 34% average sampling reduction while maintaining higher accuracy than single-LLM self-consistency",
    473       "evidence": "Section 4.2 reports per-dataset actual sampling counts (e.g., 9.2 of 16 on GSM8K) averaging to 34% savings; Table 3 confirms corresponding cost reductions.",
    474       "supported": "strong"
    475     },
    476     {
    477       "claim": "ModelSwitch achieves 63.2% on MMLU-Pro, 10.2 points above the best single LLM and above all multi-agent debate methods",
    478       "evidence": "Figure 5 shows ModelSwitch at 63.2% vs. best single LLM at 53% and MOA at 52.6% on MMLU-Pro with 15-sample budget.",
    479       "supported": "strong"
    480     },
    481     {
    482       "claim": "Optimal ModelSwitch performance requires only 2–3 comparable LLMs; adding more can degrade performance",
    483       "evidence": "Figure 6 shows DATE accuracy drops from 78.6% (2 models) to 76.4% (6 models) in strong-to-weak ordering; MathBench plateaus after 2 models.",
    484       "supported": "moderate"
    485     },
    486     {
    487       "claim": "ModelSwitch can be combined with reward models (RM-BoN) to further boost performance beyond voting-only results",
    488       "evidence": "Figure 7 shows ModelSwitch+RM-BoN reaching 84% on MATH vs. 80% for vanilla ModelSwitch and 82% for best-single-LLM+RM-BoN.",
    489       "supported": "moderate"
    490     }
    491   ],
    492   "methodology_tags": [
    493     "benchmark-eval",
    494     "theoretical"
    495   ],
    496   "key_findings": "ModelSwitch leverages the empirically observed correlation between answer consistency (entropy) and accuracy to dynamically switch between multiple LLMs during repeated sampling, achieving better accuracy than single-LLM self-consistency while reducing average sample counts by 34%. On MMLU-Pro, ModelSwitch using two lightweight models (GPT-4o mini + Gemini 1.5 Flash) achieves 63.2% accuracy, outperforming stronger individual models like GPT-4o and all tested multi-agent debate systems by wide margins. The method exhibits a key non-monotonic scaling property: 2–3 comparable LLMs is optimal; adding weaker models beyond this point can hurt performance. A formal theoretical analysis provides sufficient conditions for ModelSwitch to outperform single-model majority voting and bounds expected sampling efficiency gains.",
    497   "red_flags": [
    498     {
    499       "flag": "Math-dominated evaluation",
    500       "detail": "5 of 7 benchmarks are mathematics tasks (GSM8K, MATH, MathBench, MGSM, AIME24). Claims of generalizability to 'various reasoning and knowledge-based tasks' are not adequately supported by this benchmark selection."
    501     },
    502     {
    503       "flag": "No variance or error bars",
    504       "detail": "All results are reported as single accuracy numbers, apparently from a single run (the number of runs is not stated). Without repeated experiments or confidence intervals, it is impossible to assess whether differences are statistically meaningful, especially for small gaps."
    505     },
    506     {
    507       "flag": "Benchmark contamination unaddressed",
    508       "detail": "None of the tested models' training cutoffs are disclosed and contamination of widely-used benchmarks (GSM8K, MATH) in model pretraining corpora is not discussed, despite being a serious confound for capability comparisons."
    509     },
    510     {
    511       "flag": "No limitations section",
    512       "detail": "The paper contains no dedicated limitations, failure mode, or threats-to-validity discussion. Conditions under which ModelSwitch fails (e.g., when both models are confidently wrong) are not analyzed empirically."
    513     },
    514     {
    515       "flag": "Dataset-specific external weight tuning",
    516       "detail": "Tables 1 and 2 show that external weights Wβ are set differently per dataset (e.g., GPT-4o mini gets weight 2 on MATH but 1 on MMLU-Pro). This apparent per-dataset tuning on the test benchmarks, with no selection procedure described, raises overfitting concerns."
    517     },
    518     {
    519       "flag": "No prompts provided",
    520       "detail": "The paper only states that 'COT format' is used but provides no actual prompt text, making exact replication dependent on the GitHub repository."
    521     },
    522     {
    523       "flag": "No alternative switching baseline",
    524       "detail": "There is no comparison to a simple random-switching baseline, making it unclear whether the consistency-based switching criterion specifically is responsible for gains vs. simple multi-model ensemble effects."
    525     }
    526   ],
    527   "cited_papers": [
    528     {
    529       "title": "Self-consistency improves chain of thought reasoning in language models",
    530       "relevance": "Foundational baseline method that ModelSwitch extends; establishes the repeated-sampling-then-voting framework"
    531     },
    532     {
    533       "title": "Large language monkeys: Scaling inference compute with repeated sampling",
    534       "relevance": "Direct prior work on scaling repeated sampling that motivates the efficiency question"
    535     },
    536     {
    537       "title": "Improving factuality and reasoning in language models through multiagent debate",
    538       "relevance": "Primary multi-agent debate baseline (MAD) that ModelSwitch outperforms on all benchmarks"
    539     },
    540     {
    541       "title": "Mixture-of-agents enhances large language model capabilities",
    542       "relevance": "Key multi-LLM baseline (MOA) in comparison; represents hierarchical multi-LLM aggregation approach"
    543     },
    544     {
    545       "title": "AgentVerse: Facilitating multi-agent collaboration and exploring emergent behaviors",
    546       "relevance": "Baseline for multi-agent collaboration with dynamic group composition"
    547     },
    548     {
    549       "title": "ChatEval: Towards better LLM-based evaluators through multi-agent debate",
    550       "relevance": "Multi-agent debate baseline that ModelSwitch is compared against"
    551     },
    552     {
    553       "title": "Scaling LLM test-time compute optimally can be more effective than scaling model parameters",
    554       "relevance": "Contextualizes the test-time compute scaling paradigm that this work contributes to"
    555     },
    556     {
    557       "title": "If multi-agent debate is the answer, what is the question?",
    558       "relevance": "Critical analysis of multi-agent debate limitations that supports ModelSwitch's approach"
    559     }
    560   ],
    561   "engagement_factors": {
    562     "practical_relevance": {
    563       "score": 2,
    564       "justification": "The method is simple to implement (multi-API sampling + voting) and reduces API costs by 34%, making it immediately applicable to practitioners using multiple LLM providers."
    565     },
    566     "surprise_contrarian": {
    567       "score": 1,
    568       "justification": "The finding that two weak models can outperform strong models via simple mixing is mildly surprising, but the complementary-strengths intuition is not deeply contrarian."
    569     },
    570     "fear_safety": {
    571       "score": 0,
    572       "justification": "No safety, security, or risk implications discussed or relevant."
    573     },
    574     "drama_conflict": {
    575       "score": 0,
    576       "justification": "No controversy or conflict. Standard benchmark comparison paper."
    577     },
    578     "demo_ability": {
    579       "score": 2,
    580       "justification": "Code is released on GitHub. The method is simple enough to implement independently, though it requires multiple API accounts."
    581     },
    582     "brand_recognition": {
    583       "score": 1,
    584       "justification": "Shanghai AI Laboratory has moderate recognition. The paper uses well-known models (GPT-4o, Gemini, Claude) but is not from those labs."
    585     }
    586   },
    587   "hn_data": {
    588     "threads": [
    589       {
    590         "hn_id": "40932006",
    591         "title": "An abundance of Katherines: The game theory of baby naming",
    592         "points": 288,
    593         "comments": 148,
    594         "url": "https://news.ycombinator.com/item?id=40932006"
    595       },
    596       {
    597         "hn_id": "44052041",
    598         "title": "Discord Unveiled: A Comprehensive Dataset of Public Communication (2015-2024)",
    599         "points": 152,
    600         "comments": 179,
    601         "url": "https://news.ycombinator.com/item?id=44052041"
    602       },
    603       {
    604         "hn_id": "43417530",
    605         "title": "Neurosymbolic Decision Trees",
    606         "points": 42,
    607         "comments": 0,
    608         "url": "https://news.ycombinator.com/item?id=43417530"
    609       },
    610       {
    611         "hn_id": "39986540",
    612         "title": "A Survey on Red Teaming for Generative Models",
    613         "points": 16,
    614         "comments": 0,
    615         "url": "https://news.ycombinator.com/item?id=39986540"
    616       },
    617       {
    618         "hn_id": "43986826",
    619         "title": "Bang for the Buck: Vector Search on Cloud CPUs",
    620         "points": 5,
    621         "comments": 0,
    622         "url": "https://news.ycombinator.com/item?id=43986826"
    623       },
    624       {
    625         "hn_id": "31032132",
    626         "title": "A Study of Real-World Data Races in Golang",
    627         "points": 5,
    628         "comments": 0,
    629         "url": "https://news.ycombinator.com/item?id=31032132"
    630       },
    631       {
    632         "hn_id": "43905563",
    633         "title": "(How) Do reasoning models reason?",
    634         "points": 3,
    635         "comments": 0,
    636         "url": "https://news.ycombinator.com/item?id=43905563"
    637       },
    638       {
    639         "hn_id": "46386776",
    640         "title": "LitBench: A Benchmark and Dataset for Reliable Evaluation of Creative Writing",
    641         "points": 3,
    642         "comments": 0,
    643         "url": "https://news.ycombinator.com/item?id=46386776"
    644       },
    645       {
    646         "hn_id": "43751796",
    647         "title": "(How) Do reasoning models reason?",
    648         "points": 2,
    649         "comments": 0,
    650         "url": "https://news.ycombinator.com/item?id=43751796"
    651       },
    652       {
    653         "hn_id": "44179940",
    654         "title": "Stop Anthropomorphizing Intermediate Tokens as Reasoning/Thinking Traces",
    655         "points": 1,
    656         "comments": 0,
    657         "url": "https://news.ycombinator.com/item?id=44179940"
    658       }
    659     ],
    660     "top_points": 288,
    661     "total_points": 517,
    662     "total_comments": 327
    663   }
    664 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs