scan.json (26760B)
1 { 2 "paper": { 3 "title": "Pyramid MoA: A Probabilistic Framework for Cost-Optimized Anytime Inference", 4 "authors": ["Arindam Khaled"], 5 "year": 2026, 6 "venue": "arXiv", 7 "arxiv_id": "2602.19509" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "Pyramid MoA uses a lightweight classifier (Random Forest or XGBoost) trained on ensemble agreement features from small models (Llama-3-8B, Mistral-7B, Qwen-2.5-7B) to decide whether to escalate queries to a larger Oracle model (Llama-3-70B). On GSM8K, the system reports 91.4% accuracy at T=0.70 (vs 79.1% accuracy for small models alone) while escalating only 25.5% of queries. On MBPP, ensemble agreement features outperformed self-reported model confidence for detecting buggy outputs (82.6% recall). The paper claims 93.0% accuracy with 61% cost reduction in the abstract, but only the 91.4%/T=0.70 operating point is detailed in the results section.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The paper uses publicly available standard benchmarks: GSM8K, MBPP, and HumanEval. These are public datasets that do not require the authors to release them." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "No requirements.txt, Dockerfile, conda environment, or environment setup section is provided. Only model names are mentioned without dependency specifications." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided." 
34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "All results are reported as point estimates (93.0% accuracy, 82.6% recall, 91.4% accuracy) with no confidence intervals, error bars, or uncertainty quantification." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper claims Pyramid MoA outperforms baselines (FrugalGPT, Layer 1 alone) based solely on comparing point estimates. No statistical significance tests (p-values, t-tests, bootstrap tests, etc.) are reported." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "The paper reports improvements with baseline context: 91.4% vs 79.1% baseline on GSM8K, 93.0% vs 98.0% Oracle, 3.5x cost reduction, 40% lower inference cost than FrugalGPT on HumanEval. These provide sufficient context to understand effect magnitude." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "The GSM8K test set size (N=1,052) is stated but not justified. The MBPP evaluation has only 23 buggy samples in the confusion matrix with no justification for why this is sufficient for the claims." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "No standard deviations, variance across seeds, or multiple-run results are reported. All figures appear to be single-run point estimates." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "The paper compares against: Layer 1 only (79.1% baseline), Oracle-only (Llama-3-70B at 98.0%), and FrugalGPT cascade (Llama-8B → Llama-70B) on HumanEval (Section 3.2.2, Figure 3)." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "FrugalGPT (2023) is a recent cost-efficiency baseline. 
The models used (Llama-3, Mistral-7B, Qwen-2.5-7B) are contemporary. Related work discusses recent MoA variants (Sparse MoA, Residual MoA, Self-MoA from 2024-2025)." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": false, 77 "justification": "The system has multiple components (3-model ensemble, router, Oracle layer) but no systematic ablation removing individual components (e.g., testing with 2 vs 3 ensemble models, or removing the router). Feature importance (Figure 2b) shows variable importance within the router but is not a component ablation. Table 1 compares two router algorithms but is a configuration comparison, not ablation." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "The paper reports accuracy, recall (82.6% for bug detection), cost reduction (61%, 3.5x), and latency overhead (+0.82s) across experiments." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": false, 87 "justification": "No human evaluation of system outputs is included. All evaluation is automated (benchmark accuracy, confusion matrices). Human evaluation of code generation quality beyond pass/fail could have been informative." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": true, 92 "justification": "Section 3.3.1 explicitly states: 'We evaluated the router on a strictly held-out test set (N = 1,052).' MBPP uses the standard test set." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": false, 97 "justification": "No per-category or per-difficulty breakdowns are provided. GSM8K results are a single aggregate accuracy. No analysis of performance on easy vs. hard problems, which is directly relevant to a routing system that claims to identify 'hard' queries." 
98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": false, 102 "justification": "No qualitative failure examples, error analysis, or discussion of when the router makes wrong escalation decisions (false positives/negatives beyond the confusion matrix counts)." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section 3.1 reports that Random Forest 'lacked the calibration precision required for the continuous, non-linear feature space of mathematical reasoning,' necessitating the switch to XGBoost. This is an explicit negative result about a configuration that did not work." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": false, 114 "justification": "The abstract claims '93.0% accuracy, effectively matching the Oracle baseline (98.0%) while reducing compute costs by 61%.' However, Section 3.3.1 reports 91.4% accuracy at T=0.70 with 25.5% escalation (74.5% short-circuited). The 93.0%/61% operating point is asserted in the abstract and introduction but never shown in the detailed results. Additionally, 93.0% vs 98.0% is a 5pp gap — calling this 'effectively matching' is a stretch." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "The main causal claim — that ensemble agreement enables better routing than self-reported confidence — is supported by the feature importance analysis (Figure 2b) showing semantic agreement and output variance outperform self-confidence. The system-level claims (routing reduces cost while maintaining accuracy) are supported by the baseline comparisons." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title claims 'Cost-Optimized Anytime Inference' broadly, and the Discussion says the framework 'democratizes access to high-end reasoning capabilities.' 
The paper tests on only GSM8K (math) and MBPP/HumanEval (code) with specific open-weight models. The broad framing exceeds the tested settings." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "No alternative explanations are discussed. For example: Could the cost savings be achieved by a simpler confidence threshold on a single model? Could the router's effectiveness be an artifact of the specific benchmark difficulty distribution? No confounds or robustness checks are considered." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper measures accuracy on benchmarks and compute cost, and claims cost-optimized inference. The measurements directly correspond to the claims — no proxy gap exists between what is measured and what is claimed at the empirical level." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "Models are listed as 'Llama-3-8B', 'Mistral-7B', 'Qwen-2.5-7B', 'Llama-3-70B' without specifying instruct vs. base variants, exact version numbers (e.g., Mistral-7B-v0.1 vs v0.3), or snapshot dates. These are model family names, not exact versions." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": false, 146 "justification": "No actual prompt text is provided for how queries are sent to the ensemble models or the Oracle. The paper describes the architecture but never shows the prompts used." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": false, 151 "justification": "The routing threshold T=0.70 is reported, but LLM inference parameters (temperature, top-p, max tokens) are not mentioned. XGBoost and Random Forest hyperparameters are also not reported." 
152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "The system is a model routing/ensemble architecture, not agentic scaffolding. There is no tool use, iterative loops, memory management, or agent-like behavior." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": false, 161 "justification": "No description of how the router's training data was prepared, how features were extracted from model outputs, how train/validation/test splits were created for the router classifier, or how the benchmark data was preprocessed." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": false, 168 "justification": "There is no dedicated limitations or threats-to-validity section. The Discussion & Conclusion (Section 5) is a single short paragraph that mentions future work but does not discuss limitations." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "No threats to validity are discussed anywhere in the paper — neither specific nor generic." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "No explicit statements about what the results do NOT show. The paper mentions 'Future work will explore extending this architecture to other domains such as code generation and legal reasoning' (which contradicts having already tested code generation) but does not state scope boundaries." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "No raw data is available — no model outputs, router predictions, per-example results, or supplementary data files." 
186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": false, 190 "justification": "The benchmarks used are named (GSM8K, MBPP, HumanEval), but the router training data collection is not described: how were ensemble outputs gathered, how were ground-truth labels assigned, what was the training set composition?" 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants. Data sources are standard public benchmarks." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": false, 200 "justification": "The pipeline from raw model outputs to ensemble features (semantic agreement, output variance) to router training to final evaluation is not documented. Feature extraction methodology is mentioned conceptually but not specified in reproducible detail." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": false, 206 "answer": false, 207 "justification": "The author is listed as 'Independent Researcher' with a personal email (arindamkhaled@gmail.com). This is clearly unfunded solo work." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "The author's affiliation is disclosed as 'Independent Researcher.' No product from the author's organization is being evaluated — the paper uses open-source models from Meta, Mistral AI, and Alibaba." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": false, 216 "answer": false, 217 "justification": "The work is unfunded (solo independent researcher)." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interests statement is included in the paper." 
223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "No training data cutoff dates are stated for any of the four models used (Llama-3-8B, Mistral-7B, Qwen-2.5-7B, Llama-3-70B), despite evaluating them on benchmarks." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "No discussion of whether benchmark examples (GSM8K, MBPP, HumanEval) appeared in the models' training data." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "GSM8K (2021), MBPP (2021), and HumanEval (2021) all predate the training of Llama-3, Mistral-7B, and Qwen-2.5-7B. These models almost certainly saw these benchmarks during training, yet contamination is not discussed." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this study." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 
277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": true, 283 "justification": "The paper reports relative cost metrics: 61% cost reduction, 3.5x compute cost reduction at T=0.70, and latency overhead of +0.82s. While absolute costs are not reported, the relative cost analysis is central to the paper's contribution." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "No total computational budget is stated — no GPU hours, hardware specifications, training time for the router classifiers, or total API/compute spend." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single runs." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "The number of experimental runs is never stated. Results are presented without indicating how many runs produced them." 301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "No hyperparameter search budget is reported for either the XGBoost or Random Forest router classifiers, nor for the threshold selection." 306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": false, 310 "justification": "Section 3.1 explains the empirical motivation for choosing XGBoost over Random Forest for math reasoning, but does not describe a systematic selection process (e.g., validation set performance comparison, cross-validation)." 311 }, 312 "multiple_comparison_correction": { 313 "applies": true, 314 "answer": false, 315 "justification": "Multiple comparisons are made across benchmarks and configurations but no significance tests are performed at all, let alone corrections for multiple comparisons." 
316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The authors implement the baselines (FrugalGPT cascade) and the proposed system. No acknowledgment of author-evaluation bias or independent evaluation." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": true, 325 "justification": "Figure 4 shows the Pareto frontier of accuracy vs. computational cost at different routing thresholds, directly presenting performance as a function of compute budget." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": false, 330 "justification": "No discussion of whether GSM8K, MBPP, or HumanEval actually measure the capabilities the paper claims to optimize. For example, GSM8K math problems may not represent real-world 'hard reasoning' that would benefit from Oracle escalation." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": false, 334 "answer": false, 335 "justification": "The routing architecture IS the thing being tested. The paper evaluates its routing framework as a bundled system, not making claims about isolated model capabilities." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "GSM8K (2021), MBPP (2021), and HumanEval (2021) all predate the models' training. No discussion of temporal leakage — the models may have seen these benchmark solutions during training." 343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether the evaluation setup leaks information. For example, whether the router's training data and the benchmark evaluation data are properly separated is not addressed." 
348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of independence between the router's training data and the evaluation test set, or between benchmark examples." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No concrete leakage detection or prevention methods are used (no canary strings, membership inference, decontamination, or temporal splits)." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "Pyramid MoA achieves 93.0% accuracy on GSM8K, effectively matching the Oracle baseline (98.0%) while reducing compute costs by 61%.", 364 "evidence": "Abstract and Introduction state these figures. However, Section 3.3.1 reports 91.4% accuracy at T=0.70 with 25.5% escalation. The 93.0%/61% operating point is asserted but not shown in the detailed results section.", 365 "supported": "weak" 366 }, 367 { 368 "claim": "The Consensus Router achieves 82.6% recall on MBPP, intercepting 19 out of 23 buggy code snippets.", 369 "evidence": "Section 3.2.1 and Figure 2a show the confusion matrix. N=23 buggy samples is very small for generalizable claims.", 370 "supported": "moderate" 371 }, 372 { 373 "claim": "Output Length Variance and Semantic Agreement are the dominant predictive features, outperforming self-reported model confidence.", 374 "evidence": "Feature importance analysis in Figure 2b validates this for the Random Forest router on MBPP. Aligns with prior work on model miscalibration [9].", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "The Consensus MoA achieves 40% lower inference cost than FrugalGPT for the same accuracy level on HumanEval.", 379 "evidence": "Figure 3 shows the comparison. 
However, no variance is reported and the comparison methodology is not detailed.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "At T=0.70, the Anytime Router achieves 91.4% accuracy on GSM8K while escalating only 25.5% of queries, representing a 3.5x cost reduction.", 384 "evidence": "Section 3.3.1 reports these figures from a held-out test set (N=1,052). Single-run result with no variance.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "The system introduces negligible latency overhead (+0.82s).", 389 "evidence": "Stated in the abstract only. No detailed latency measurement methodology or breakdown is presented in the paper body.", 390 "supported": "weak" 391 } 392 ], 393 "red_flags": [ 394 { 395 "flag": "Abstract-body discrepancy", 396 "detail": "The abstract claims 93.0% accuracy and 61% cost reduction on GSM8K, but Section 3.3.1 reports 91.4% accuracy at T=0.70 with 25.5% escalation (74.5% short-circuited), which the paper describes as a 3.5x (roughly 71%) cost reduction rather than 61%. The 93.0%/61% operating point is asserted in the abstract and introduction but never documented in the detailed results." 397 }, 398 { 399 "flag": "Overclaiming: 'effectively matching'", 400 "detail": "The abstract says 93.0% 'effectively matches' the Oracle's 98.0%. A 5 percentage point gap (93.0 vs 98.0) is a meaningful difference, not a match." 401 }, 402 { 403 "flag": "No error bars or variance on any result", 404 "detail": "All results are single-run point estimates. For a system that depends on ensemble agreement and a trained classifier, variance across runs and random seeds is critical for interpreting whether differences are meaningful." 405 }, 406 { 407 "flag": "Tiny sample size for code generation claims", 408 "detail": "The MBPP bug detection evaluation is based on only 23 buggy samples (confusion matrix in Figure 2a). The 82.6% recall (19/23) has wide confidence intervals at this sample size." 
409 }, 410 { 411 "flag": "Complete absence of contamination analysis", 412 "detail": "All three benchmarks (GSM8K, MBPP, HumanEval) were published in 2021, years before the models' training data was collected. The routing system's performance depends on the base models' accuracy, which may be inflated by contamination." 413 }, 414 { 415 "flag": "Self-contradictory future work", 416 "detail": "Section 5 states 'Future work will explore extending this architecture to other domains such as code generation' despite Experiment I already covering code generation on MBPP/HumanEval." 417 }, 418 { 419 "flag": "Missing reproducibility details", 420 "detail": "No code, no model version specifics (instruct vs base), no hyperparameters for LLM inference or router classifiers, no feature extraction methodology, no router training data description. The paper cannot be reproduced from the information provided." 421 } 422 ], 423 "cited_papers": [ 424 { 425 "title": "Mixture-of-Agents Enhances Large Language Model Capabilities", 426 "authors": ["Junlin Wang", "Jue Wang", "Ben Athiwaratkun", "Ce Zhang", "James Zou"], 427 "year": 2024, 428 "arxiv_id": "2406.04692", 429 "relevance": "Foundational MoA framework showing collaborative LLM layers can outperform individual models; directly extended by Pyramid MoA." 430 }, 431 { 432 "title": "Rethinking Mixture-of-Agents: Is Mixing Different Large Language Models Beneficial?", 433 "authors": ["Wenzhe Li", "Yong Lin", "Mengzhou Xia", "Chi Jin"], 434 "year": 2025, 435 "arxiv_id": "2502.00674", 436 "relevance": "Challenges MoA diversity assumptions by showing Self-MoA (single-model iteration) can match multi-model ensembles." 
437 }, 438 { 439 "title": "Residual Mixture of Agents", 440 "authors": ["Zhentao Xie", "Chengcheng Han", "Jinxin Shi", "Wenjun Cui", "Xin Zhao", "Xingjiao Wu", "Jiabao Zhao"], 441 "year": 2025, 442 "relevance": "Introduces adaptive termination and residual connections in MoA to reduce compute, achieving gains on MATH benchmark." 443 }, 444 { 445 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 446 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 447 "year": 2023, 448 "arxiv_id": "2305.05176", 449 "relevance": "Key baseline for cost-efficient LLM usage via cascading strategies; Pyramid MoA directly compares against this approach." 450 }, 451 { 452 "title": "On Calibration of Modern Neural Networks", 453 "authors": ["Chuan Guo", "Geoff Pleiss", "Yu Sun", "Kilian Q. Weinberger"], 454 "year": 2017, 455 "relevance": "Documents model miscalibration (confident but wrong), motivating the paper's use of external agreement signals rather than self-reported confidence." 456 }, 457 { 458 "title": "BranchyNet: Fast Inference via Early Exiting from Deep Neural Networks", 459 "authors": ["Surat Teerapittayanon", "Bradley McDanel", "H.T. Kung"], 460 "year": 2016, 461 "relevance": "Early-exit strategy for deep networks that inspired the anytime/early-termination approach in Pyramid MoA." 462 }, 463 { 464 "title": "Sparse MoA", 465 "authors": ["Giang Do", "Hung Le", "Truyen Tran"], 466 "year": 2024, 467 "relevance": "Introduces sparsity-based early stopping in MoA architectures to reduce computational overhead." 468 } 469 ] 470 }