ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (17760B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "theoretical",
      4   "paper": {
      5     "title": "Don't Always Pick the Highest-Performing Model: An Information Theoretic View of LLM Ensemble Selection",
      6     "authors": [
      7       "Yigit Turkmen",
      8       "Baturalp Buyukates",
      9       "Melih Bastopcu"
     10     ],
     11     "year": 2026,
     12     "venue": "arXiv",
     13     "arxiv_id": "2602.08003",
     14     "doi": null
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "Abstract claims 'consistently outperforms strong baselines under the same query budget' — results in Figures 5, 6 and Tables 4, 8, 12 show this for MEDMCQA and MMLU in mid-range k. IMDB gains are modest but present. The saturation floor claim is supported by Theorem 4.4.",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper makes causal claims ('correlation introduces additional structure,' 'mRMR's aggressive diversity-seeking has forced it to include several weak models, degrading overall performance'). These are justified through formal theorems (4.1, 4.3, 4.4) and the ablation structure (Terms 1 vs 1+2 vs full).",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Section 8 explicitly bounds scope: 'Our study focuses on a binary decision setting' and acknowledges 'extending these insights to richer output spaces and alternative dependency structures presents a promising direction.' The title and abstract don't overclaim beyond what's shown.",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Section 8 discusses alternative explanations: MAP estimation difficulty at large k, Gaussian-copula model limitations, the role of training dataset size for MI estimation. Appendix F.2 discusses aggregation rule interaction effects.",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper measures test error probability and claims this measures ensemble classification accuracy. The measurement matches the claim directly — no proxy gap exists. The binary conversion procedure is transparent.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 8 'Limitations and Discussion' provides substantive discussion of the binary setting limitation, Gaussian-copula model assumptions, and saturation effects.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section 8 identifies specific threats: binary decision setting may not extend to richer output spaces, Gaussian-copula may not capture all dependency structures, and saturation effects limit improvements. These are specific to this study.",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Section 8 states: 'Our study focuses on a binary decision setting, which allows for a clean and interpretable information-theoretic analysis and serves as a foundational step toward more general formulations.' Explicitly scopes to binary classification.",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Funding disclosed on page 1: 'This work was supported by Tubitak 2232-B program (Project No:124C533).'",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations clearly listed: Bilkent University (Turkmen, Bastopcu) and University of Birmingham (Buyukates). No affiliation with any evaluated model provider.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Tubitak is the Scientific and Technological Research Council of Turkey, a government research funding agency with no financial interest in which ensemble selection method performs best.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interests statement is included in the paper.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms are defined precisely: 'budgeted ensemble selection' (Section 3.3, Eq. 4), 'Gaussian-copula' (Section 3.1), 'MAP estimator' (Section 3.2, Eq. 3), 'mutual information gain' (Section 4.2, Eq. 7).",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "A five-bullet 'Contributions' paragraph in Section 1 explicitly lists each claim: Gaussian-copula representation, independence optimality theorem, greedy MI algorithm, saturation floor theorem, and empirical validation.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 explicitly contrasts this work with mRMR (Peng 2005) and shows the intuition does not transfer; connects to LLM ensemble literature (Kim 2025, Cemri 2025, Jiang 2023) and Gaussian-copula literature (Li 2000, Pan 2025).",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "theoretical": {
    118       "formal_quality": {
    119         "assumptions_stated_explicitly": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "Assumptions are stated explicitly: balanced prior P(Y=±1)=0.5 (Section 3), ε<0.5 for all models (Theorem 4.1), uniform pairwise correlation ρ>0 (Theorem 4.4), and the label-invariant error assumption (E1,...,Em)⊥Y (Theorem 4.3 and Corollary B.5).",
    123           "source": "haiku"
    124         },
    125         "proofs_complete_or_sketched": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "All four theorems (4.1, 4.3, 4.4, D.1) have complete proofs in Appendices A–D, not just sketches; lemmas and corollaries are also proved.",
    129           "source": "haiku"
    130         },
    131         "bounds_tight_or_discussed": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "Remark A.5 explicitly discusses tightness of Theorem 4.1 (equalities hold when the subset is Top-k, strict inequality otherwise); Remark C.1 discusses edge cases as ρ→0 and ρ→1 for the saturation limit.",
    135           "source": "haiku"
    136         },
    137         "counterexamples_explored": {
    138           "applies": true,
    139           "answer": true,
    140           "justification": "Figure 2 provides a concrete counterexample showing a 72%-average diverse ensemble outperforming an 81%-average correlated GPT ensemble; Example A.1 in Appendix shows stochastic degradation with m=3 models.",
    141           "source": "haiku"
    142         },
    143         "notation_consistent": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "Notation is defined once and used consistently: Y for true label, Xj for predictions, Ej for error indicators, S for subsets, ρ for correlation, τj for thresholds throughout all sections and appendices.",
    147           "source": "haiku"
    148         },
    149         "constructive_vs_existence_noted": {
    150           "applies": true,
    151           "answer": true,
    152           "justification": "The paper explicitly notes that finding S*k is NP-hard (combinatorial) and therefore proposes the greedy MI algorithm as a tractable constructive approximation; the saturation floor (Theorem 4.4) is computable given α and ρ.",
    153           "source": "haiku"
    154         }
    155       },
    156       "connections": {
    157         "connection_to_practice_discussed": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Extensive empirical evaluation on three practical QA/classification tasks with 12-13 current frontier LLMs; Appendix E provides computational complexity analysis; gains are discussed in terms of practical query budgets (k=3–7).",
    161           "source": "haiku"
    162         },
    163         "relationship_to_prior_work_clear": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "Section 4.2 explicitly shows Theorem 4.3 generalizes mRMR (Peng 2005) with an additional I(Ej;ES) term; Section 2 positions this work as complementary to aggregation-focused work (Jiang 2023, Yang 2025b) by addressing selection instead.",
    167           "source": "haiku"
    168         },
    169         "computational_complexity_discussed": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "Appendix E provides explicit complexity analysis for all algorithms: MI estimation O(N+KaKb), MAP aggregation O((Ntr+Nte)k+2^k), and acknowledges the exponential growth in k as a practical limitation.",
    173           "source": "haiku"
    174         },
    175         "limitations_of_formal_model_stated": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Section 8 states the Gaussian-copula model is used for 'key correlation patterns' but does not fully capture richer dependency structures; the balanced prior and binary output space are identified as model limitations.",
    179           "source": "haiku"
    180         }
    181       }
    182     }
    183   },
    184   "claims": [
    185     {
    186       "claim": "When LLM errors are independent, selecting the top-k most accurate models is simultaneously optimal for mutual information and error probability.",
    187       "evidence": "Theorem 4.1 with complete proof in Appendix A using stochastic degradation and data processing inequality.",
    188       "supported": "strong"
    189     },
    190     {
    191       "claim": "Correlated LLM ensembles have a non-vanishing error floor even with infinite models and optimal aggregation.",
    192       "evidence": "Theorem 4.4 with complete proof in Appendix C; error floor = Φ(Φ⁻¹(1-α)/√ρ) under equicorrelated Gaussian-copula.",
    193       "supported": "strong"
    194     },
    195     {
    196       "claim": "Greedy MI outperforms Top-k accuracy selection across all tested datasets under identical query budgets.",
    197       "evidence": "Figures 5, 6, 17 and Tables 4, 8, 12 showing consistent improvement over 30 evaluations (3 temps × 2 runs × 5 splits); best gain: 16.3% (Greedy MI) vs 17.0% (Top-k) error at k=5 on MEDMCQA.",
    198       "supported": "strong"
    199     },
    200     {
    201       "claim": "mRMR-style feature selection does not directly transfer to ensemble selection because it ignores the I(Ej;ES) error correlation term.",
    202       "evidence": "Theorem 4.3 (Accuracy-Redundancy-Error Decomposition) and Table 1/2 showing mRMR (Terms 1+2) selects weak models aggressively.",
    203       "supported": "strong"
    204     },
    205     {
    206       "claim": "Gaussian-copula accurately models LLM error dependence structure including higher-order simultaneous failures.",
    207       "evidence": "Figures 4, 10, 11, 15, 16 showing close fit of copula to empirical error distributions across all datasets and temperature settings.",
    208       "supported": "moderate"
    209     },
    210     {
    211       "claim": "Within-family model correlations (ρ≈0.7–0.8) are substantially higher than cross-family correlations (ρ≈0.4–0.5) on MEDMCQA.",
    212       "evidence": "Figure 22 correlation matrix and discussion in Section 6.1; however, this observation drives the selection narrative without formal hypothesis testing.",
    213       "supported": "moderate"
    214     }
    215   ],
    216   "methodology_tags": [
    217     "theoretical",
    218     "benchmark-eval"
    219   ],
    220   "key_findings": "The paper proves that Top-k accuracy selection is optimal only when LLM errors are independent, and shows through Theorem 4.3 that correlation introduces an additional error-correlation term I(Ej;ES) that mRMR-style methods miss. Under uniform pairwise correlation, Theorem 4.4 establishes a fundamental, non-vanishing error floor Φ(Φ⁻¹(1-α)/√ρ) that cannot be reduced by adding more models. The proposed Greedy MI algorithm, which iteratively selects models maximizing marginal mutual information gain, consistently outperforms Top-k and mRMR baselines in the practical budget range k=3–7, with a ~0.7pp error reduction on MEDMCQA and ~1pp on MMLU; gains are minimal on IMDB where ρ̄=0.90 keeps the ensemble near the theoretical floor.",
    221   "red_flags": [
    222     {
    223       "flag": "Modest empirical gains",
    224       "detail": "The best improvement over Top-k is 0.7pp absolute error (16.3% vs 17.0% on MEDMCQA), which is within the reported standard deviations; statistical significance tests (p-values) are not reported."
    225     },
    226     {
    227       "flag": "Binary classification only",
    228       "detail": "All multi-class benchmarks (MEDMCQA, MMLU) are artificially converted to binary True/False queries, which may not reflect how practitioners actually deploy LLM ensembles."
    229     },
    230     {
    231       "flag": "MAP estimator requires 2^k parameters",
    232       "detail": "The MAP aggregator estimates P(Y|XS) over 2^k patterns, which degrades for k>8 due to data sparsity — the paper shows performance declines at large k, but this is partly an artifact of the aggregation choice rather than selection alone."
    233     },
    234     {
    235       "flag": "No competing interests statement",
    236       "detail": "The paper uses frontier models from OpenAI, Anthropic, Google, Mistral, and others as experimental subjects without declaring any potential financial interests in their relative performance."
    237     },
    238     {
    239       "flag": "Copula fit by construction at second order",
    240       "detail": "The Gaussian-copula is fit by matching pairwise marginals (Eq. 14), so agreement in Figure 4a (scatter plot) is partly circular; higher-order agreement (Figure 4b histograms) is the meaningful validation."
    241     }
    242   ],
    243   "cited_papers": [
    244     {
    245       "title": "Feature selection based on mutual information criteria of max-dependency, max-relevance, and min-redundancy",
    246       "relevance": "Foundational mRMR criterion that the paper extends and shows is insufficient for ensemble selection due to the missing error-correlation term."
    247     },
    248     {
    249       "title": "Self-consistency improves chain of thought reasoning in language models",
    250       "relevance": "Popularized majority voting for single-model sampling; starting point for multi-model ensemble methods."
    251     },
    252     {
    253       "title": "Towards a science of scaling agent systems",
    254       "relevance": "Empirical evidence for diminishing or negative returns from adding more agents, motivating the paper's saturation theorems."
    255     },
    256     {
    257       "title": "Why do multi-agent LLM systems fail?",
    258       "relevance": "Identifies inter-agent misalignment as a failure mode, supporting the paper's focus on error correlation structure."
    259     },
    260     {
    261       "title": "LLM-Blender: ensembling large language models with pairwise comparison and generative fusion",
    262       "relevance": "Representative aggregation-focused ensemble work that this paper complements by addressing selection rather than fusion."
    263     },
    264     {
    265       "title": "Simple yet effective: An information-theoretic approach to multi-LLM uncertainty quantification",
    266       "relevance": "Closely related contemporaneous work using Jensen-Shannon divergence for ensemble subset selection."
    267     },
    268     {
    269       "title": "An Introduction to Copulas",
    270       "relevance": "Foundational reference for the Gaussian-copula statistical framework used to model LLM error dependence."
    271     },
    272     {
    273       "title": "Conditional likelihood maximisation: A unifying framework for information theoretic feature selection",
    274       "relevance": "Unifies mRMR variants under conditional mutual information; paper shows these do not transfer to ensemble selection."
    275     }
    276   ],
    277   "engagement_factors": {
    278     "practical_relevance": {
    279       "score": 2,
    280       "justification": "Directly actionable for teams running LLM inference under budget constraints — the greedy algorithm is simple to implement and tested on real frontier models."
    281     },
    282     "surprise_contrarian": {
    283       "score": 3,
    284       "justification": "The title explicitly contradicts the dominant intuition; Theorem 4.1 proves Top-k is only optimal under independence, directly challenging standard practice."
    285     },
    286     "fear_safety": {
    287       "score": 0,
    288       "justification": "No safety or risk angle; the paper improves reliability but does not raise concern about AI harm."
    289     },
    290     "drama_conflict": {
    291       "score": 1,
    292       "justification": "Mild controversy in showing mRMR (a well-known method) actively hurts ensemble performance in this setting."
    293     },
    294     "demo_ability": {
    295       "score": 1,
    296       "justification": "Algorithm is fully described and implementable but no code is released; replication requires API access to 12+ frontier models."
    297     },
    298     "brand_recognition": {
    299       "score": 0,
    300       "justification": "Authors are from Bilkent University and University of Birmingham, not major AI lab brands; Tubitak funding is not recognizable to the HN audience."
    301     }
    302   },
    303   "hn_data": {
    304     "threads": [
    305       {
    306         "hn_id": "47370450",
    307         "title": "End-to-End Hardware-Driven Graph Preprocessing for Enhanced GNN Performance",
    308         "points": 5,
    309         "comments": 0,
    310         "url": "https://news.ycombinator.com/item?id=47370450",
    311         "created_at": "2026-03-13T21:51:18Z"
    312       }
    313     ],
    314     "top_points": 5,
    315     "total_points": 5,
    316     "total_comments": 0
    317   }
    318 }

Impressum · Datenschutz