scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (18924B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "theoretical",
      4   "paper": {
      5     "title": "Don't Always Pick the Highest-Performing Model: An Information Theoretic View of LLM Ensemble Selection",
      6     "authors": [
      7       "Yigit Turkmen",
      8       "Baturalp Buyukates",
      9       "Melih Bastopcu"
     10     ],
     11     "year": 2026,
     12     "venue": "arXiv",
     13     "arxiv_id": "2602.08003",
     14     "doi": null
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All abstract claims — Gaussian-copula modeling of correlated errors, information-theoretic error floor (Theorem 4.4), greedy MI algorithm, and consistent outperformance over baselines — are substantiated by proofs and experiments across three datasets.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Performance improvement claims are supported by controlled comparisons with identical query budgets across three benchmarks, three temperature settings, and five random splits per run; the mechanism is also explained theoretically via Theorem 4.3.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Section 8 explicitly bounds generalization to binary decision settings; Theorem 4.4 is conditioned on equicorrelated Gaussian structure; empirical results note limited gains in the high-correlation IMDB regime (ρ=0.90).",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The paper explains why mRMR fails (penalizes structured error correlation it should exploit, Section 4.2), why performance degrades at large k (MAP estimator's exponential 2^k pattern space), and why IMDB gains are modest (near-uniform high correlation).",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper directly measures test error probability, which is exactly the quantity claimed to be minimized; no proxy substitution occurs.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 8 'Limitations and Discussion' is a dedicated section addressing the binary decision setting restriction, Gaussian-copula model scope, and saturation effects.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Specific threats are identified: binary classification restriction, MAP estimator degradation at large k due to sparse pattern estimation over 2^k outcomes, and IMDB results showing limited gains in near-uniform high-correlation regimes (ρ=0.90).",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper explicitly states it focuses on binary decision settings (Section 8), Theorem 4.4 is conditioned on equicorrelated ensembles, and the contribution is framed as a 'foundational step' rather than a general solution.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Funding is disclosed in the footnote: 'This work was supported by Tubitak 2232-B program (Project No:124C533).'",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are clearly stated: Bilkent University, Ankara (Turkmen and Bastopcu) and University of Birmingham (Buyukates).",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Tubitak is the Turkish government scientific research council, independent of any LLM vendor or commercial interest evaluated in the paper.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interests statement (patents, equity, consulting) is present in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms are formally defined: budgeted ensemble selection problem (Section 3.3, Equation 4), Gaussian-copula error model (Section 3.1), MAP estimator (Equation 3), mutual information gain (Equation 7), and error indicator variable.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 1 lists five explicit bullet-point contributions including Theorem 4.1 (independence optimality), Theorem 4.3 (MI decomposition), Theorem 4.4 (saturation limit), the Greedy MI algorithm, and empirical evaluation.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 explicitly engages with mRMR (showing why it doesn't transfer via Theorem 4.3), LLM-TOPLA, MUSE, self-consistency, and FrugalGPT, explaining this work's distinction as selection-focused rather than aggregation-focused.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "theoretical": {
    118       "formal_quality": {
    119         "assumptions_stated_explicitly": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "All assumptions are explicitly stated: balanced prior P(Y=±1)=0.5 (Section 3), independent BSC errors for Theorem 4.1, label-invariant error assumption for simplified Theorem 4.3, and equicorrelated Gaussian (uniform ρ) for Theorem 4.4.",
    123           "source": "haiku"
    124         },
    125         "proofs_complete_or_sketched": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "All four main theorems (4.1, 4.3, 4.4, D.1) have complete step-by-step proofs in Appendices A–D, with supporting definitions and lemmas (BSC degradation, chain rule, entropy invariance under bijection).",
    129           "source": "haiku"
    130         },
    131         "bounds_tight_or_discussed": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "Remark A.5 explicitly discusses tightness of the Theorem 4.1 bound (equality when S is exactly the k-smallest-error channels); Theorem 4.4's saturation floor is tight under uniform correlation; Remark D.2 gives a (1−1/e) approximation guarantee for the greedy approach.",
    135           "source": "haiku"
    136         },
    137         "counterexamples_explored": {
    138           "applies": true,
    139           "answer": true,
    140           "justification": "Figure 2 gives a concrete counterexample where Top-k fails (four GPT models fail together at 81% avg accuracy while a diverse 72% ensemble succeeds); Example A.1 in Appendix A illustrates stochastic degradation; IMDB explores the limiting case of near-uniform high correlation.",
    141           "source": "haiku"
    142         },
    143         "notation_consistent": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "Notation is consistent throughout (X_j for predictions, E_j for errors, Y for label, S for subsets, ρ for correlation); the dual use of α for Laplace smoothing and accuracy is explicitly flagged in Algorithm 2 with a parenthetical note.",
    147           "source": "haiku"
    148         },
    149         "constructive_vs_existence_noted": {
    150           "applies": true,
    151           "answer": true,
    152           "justification": "Theorem 4.1 proof is explicitly constructive (explicit bijection and coupling construction); Theorem 4.4 provides a closed-form computable formula; Algorithm 1 gives a constructive greedy procedure that can be directly implemented.",
    153           "source": "haiku"
    154         }
    155       },
    156       "connections": {
    157         "connection_to_practice_discussed": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "The paper explicitly targets the 'practical budget constraint' regime (k=3–7), provides complete implementation details (Algorithms 1–6 with complexity analysis), evaluates on real LLM API calls across three benchmarks, and discusses deployment cost/latency tradeoffs.",
    161           "source": "haiku"
    162         },
    163         "relationship_to_prior_work_clear": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "Section 4.2 and Theorem 4.3 explicitly show why mRMR does not transfer to ensemble selection (additional I(E_j; E_S) error correlation term); Section 2 positions the work against LLM-TOPLA, MUSE, self-consistency, and FrugalGPT with clear distinctions.",
    167           "source": "haiku"
    168         },
    169         "computational_complexity_discussed": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "Appendix E provides explicit complexity analysis: MI estimation O(N + K_A K_B), MAP aggregation O((N_tr + N_te)k + 2^k); the exponential 2^k MAP term is identified as the reason for performance degradation at large k.",
    173           "source": "haiku"
    174         },
    175         "limitations_of_formal_model_stated": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Section 8 explicitly states the Gaussian-copula model may not capture all dependency structures, binary classification is a simplification, and the uniform pairwise correlation assumption in Theorem 4.4 is an idealization of the full model.",
    179           "source": "haiku"
    180         }
    181       }
    182     }
    183   },
    184   "claims": [
    185     {
    186       "claim": "When LLM errors are independent, the optimal ensemble selects the k most accurate models (Top-k is optimal in both MI and error probability).",
    187       "evidence": "Theorem 4.1 proves this via stochastic degradation and the data processing inequality; the proof is constructive, establishing an explicit Markov chain Y → X_{H_k} → X_S for any competing subset S.",
    188       "supported": "strong"
    189     },
    190     {
    191       "claim": "Under correlated LLM errors (Gaussian-copula with uniform ρ), there exists a fundamental non-vanishing error floor as ensemble size grows to infinity.",
    192       "evidence": "Theorem 4.4 derives the closed-form limit lim P(error) = Φ(Φ^{-1}(1−α)/√ρ) > 0 for any ρ > 0, α > 1/2; validated empirically by performance plateaus in Figures 5–6.",
    193       "supported": "strong"
    194     },
    195     {
    196       "claim": "Greedy MI selection consistently outperforms Top-k and mRMR-style selection under identical query budgets.",
    197       "evidence": "MEDMCQA: best error 16.3% vs. 17.0% for Top-k at k=5; MMLU: 14.1% vs. 14.9% at k=6; improvements hold across 30 evaluations (3 temperatures × 2 runs × 5 folds) per dataset.",
    198       "supported": "strong"
    199     },
    200     {
    201       "claim": "The mRMR feature selection principle does not transfer to LLM ensemble selection.",
    202       "evidence": "Theorem 4.3 shows marginal information gain has an additional I(E_j; E_S) term absent from mRMR; empirically, mRMR (Terms 1+2) reaches 0.264 error at k=2 under majority voting on MEDMCQA vs. 0.171 for Greedy MI.",
    203       "supported": "strong"
    204     },
    205     {
    206       "claim": "Gaussian-copula accurately models real LLM error dependencies, including higher-order simultaneous error distributions.",
    207       "evidence": "Pairwise scatter plots (Figures 4, 10, 15, 20) show tight diagonal alignment; simultaneous error histograms (Figures 11, 16, 21) match copula predictions; validated across 3 datasets and 6 temperature-run conditions.",
    208       "supported": "moderate"
    209     },
    210     {
    211       "claim": "Correlated errors from same model families explain Top-k's failure; cross-family diversity with maintained accuracy is the remedy.",
    212       "evidence": "Tables 1–2 show Greedy MI selects models from OpenAI, Qwen, Moonshot, Google with moderate cross-family correlations (ρ≈0.4–0.5) vs. Top-k stacking multiple OpenAI models with high within-family correlations (ρ≈0.7–0.8).",
    213       "supported": "moderate"
    214     }
    215   ],
    216   "methodology_tags": [
    217     "theoretical",
    218     "benchmark-eval"
    219   ],
    220   "key_findings": "The paper provides a rigorous information-theoretic analysis of LLM ensemble selection under query budgets. The central theoretical result (Theorem 4.1) proves Top-k accuracy selection is optimal when errors are independent — its failure in practice arises entirely from correlation structure. Theorem 4.4 establishes an explicit, unavoidable error floor under correlated ensembles: lim P(error) = Φ(Φ^{-1}(1−α)/√ρ) > 0, meaning scaling ensemble size cannot overcome shared latent difficulty. The proposed Greedy MI algorithm, motivated by a novel Accuracy-Redundancy-Error decomposition (Theorem 4.3), consistently outperforms Top-k and mRMR-style selection in the practical mid-budget regime (k=3–7) across MEDMCQA and MMLU, while gains are limited on IMDB (ρ=0.90), consistent with the saturation theorem.",
    221   "red_flags": [
    222     {
    223       "flag": "Binary classification restriction",
    224       "detail": "All theoretical results and empirical evaluations are restricted to binary (true/false) outputs; applicability to multi-class or open-ended generation tasks — far more common in real LLM deployments — is undemonstrated and likely requires significant theoretical extension."
    225     },
    226     {
    227       "flag": "MAP estimator conflates selection and aggregation quality at large k",
    228       "detail": "At large k, all methods degrade due to MAP estimator's exponential 2^k pattern space; this makes it impossible to isolate whether performance differences at large k reflect selection quality or estimator limitations, limiting the validity of large-k comparisons."
    229     },
    230     {
    231       "flag": "Balanced prior assumption throughout",
    232       "detail": "The Theorem 4.4 derivation and experimental binary conversion both assume P(Y=+1)=P(Y=−1)=0.5; the MEDMCQA conversion creates artificial balance by pairing each question with exactly one correct/incorrect answer, which may not reflect natural query distributions."
    233     },
    234     {
    235       "flag": "No competing interests declaration",
    236       "detail": "The paper does not include a competing interests or financial interests statement despite evaluating commercial models (GPT-5, Claude, Gemini) through a commercial API aggregator (OpenRouter)."
    237     }
    238   ],
    239   "cited_papers": [
    240     {
    241       "title": "Why do multi-agent LLM systems fail?",
    242       "relevance": "Identifies inter-agent misalignment and correlated errors as dominant multi-agent failure modes, directly motivating the ensemble correlation problem studied here."
    243     },
    244     {
    245       "title": "Towards a science of scaling agent systems",
    246       "relevance": "Documents diminishing/negative returns from LLM coordination above ~45% single-agent accuracy, consistent with the correlation-induced saturation theorem."
    247     },
    248     {
    249       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    250       "relevance": "Addresses cost-performance tradeoffs via cascaded LLM selection, a closely related approach to budgeted ensemble selection."
    251     },
    252     {
    253       "title": "Feature selection based on mutual information criteria of max-dependency, max-relevance, and min-redundancy (mRMR)",
    254       "relevance": "The mRMR criterion is the primary baseline the paper formally shows does not transfer to ensemble selection due to the additional error correlation structure."
    255     },
    256     {
    257       "title": "Self-consistency improves chain of thought reasoning in language models",
    258       "relevance": "Popularized majority voting for single-model sampling; extended to multi-model ensembling as one of the baseline aggregation methods evaluated."
    259     },
    260     {
    261       "title": "LLM-TOPLA: Efficient LLM ensemble by maximising diversity",
    262       "relevance": "Introduces focal diversity metrics for ensemble pruning — a competing diversity-based approach to the greedy MI selection proposed here."
    263     },
    264     {
    265       "title": "Simple yet effective: An information-theoretic approach to multi-LLM uncertainty quantification (MUSE)",
    266       "relevance": "Applies Jensen-Shannon divergence to select well-calibrated LLM subsets — a related but distinct information-theoretic ensemble selection approach."
    267     },
    268     {
    269       "title": "Conditional likelihood maximisation: A unifying framework for information theoretic feature selection",
    270       "relevance": "Unifies mRMR variants under a common framework; the paper extends this by identifying why these criteria fail for ensemble selection (missing I(E_j; E_S) term)."
    271     }
    272   ],
    273   "engagement_factors": {
    274     "practical_relevance": {
    275       "score": 2,
    276       "justification": "Practitioners building multi-LLM pipelines can directly apply the greedy MI algorithm with a labeled calibration set, though the binary classification restriction limits immediate deployment in most real-world generative use cases."
    277     },
    278     "surprise_contrarian": {
    279       "score": 3,
    280       "justification": "The title and core result directly challenge the intuitive 'pick the best model' heuristic with a formal proof, showing that accuracy alone is suboptimal and that moderately-accurate diverse models can outperform high-accuracy correlated ones."
    281     },
    282     "fear_safety": {
    283       "score": 0,
    284       "justification": "The paper does not address safety, alignment, or risk concerns; it is a technical optimization paper on ensemble selection."
    285     },
    286     "drama_conflict": {
    287       "score": 1,
    288       "justification": "The paper challenges the common Top-k heuristic and shows mRMR fails, but there is no major ongoing controversy or heated debate being adjudicated."
    289     },
    290     "demo_ability": {
    291       "score": 1,
    292       "justification": "The algorithm is implementable with API access to multiple LLMs and a labeled evaluation set, but the multi-model API costs and binary classification constraint create significant friction for casual demonstration."
    293     },
    294     "brand_recognition": {
    295       "score": 0,
    296       "justification": "Authors are from Bilkent University and University of Birmingham — academic institutions without strong LLM brand recognition; no famous lab or product affiliation."
    297     }
    298   },
    299   "hn_data": {
    300     "threads": [
    301       {
    302         "hn_id": "47370450",
    303         "title": "End-to-End Hardware-Driven Graph Preprocessing for Enhanced GNN Performance",
    304         "points": 5,
    305         "comments": 0,
    306         "url": "https://news.ycombinator.com/item?id=47370450",
    307         "created_at": "2026-03-13T21:51:18Z"
    308       }
    309     ],
    310     "top_points": 5,
    311     "total_points": 5,
    312     "total_comments": 0
    313   }
    314 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs