scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27801B)
      1 {
      2   "paper": {
      3     "title": "Don't Always Pick the Highest-Performing Model: An Information Theoretic View of LLM Ensemble Selection",
      4     "authors": [
      5       "Yigit Turkmen",
      6       "Baturalp Buyukates",
      7       "Melih Bastopcu"
      8     ],
      9     "year": 2026,
     10     "venue": "arXiv",
     11     "arxiv_id": "2602.08003"
     12   },
     13   "scan_version": 2,
     14   "active_modules": ["experimental_rigor", "data_leakage"],
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper. Algorithms are described in pseudocode (Algorithms 1-6) but no executable code is released."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper uses publicly available datasets: MEDMCQA (Pal et al., 2022), MMLU (Hendrycks et al., 2021), and IMDB movie reviews (Maas et al., 2011). All are standard public benchmarks."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No requirements.txt, Dockerfile, conda environment, or library version information is provided. The paper does not specify software dependencies."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No README, reproduction scripts, or step-by-step instructions are provided. While pseudocode algorithms are given, there are no runnable instructions for replicating experiments."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Tables 4-6, 8-10, 12-14 report mean ± std dev across 30 evaluations. Figures include shaded standard deviation regions (e.g., 'Shaded region represents the standard deviation')."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper claims 'consistently outperforms strong baselines' but uses no statistical significance tests (no p-values, t-tests, bootstrap tests, or any other test). Improvements are compared by raw numbers only."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Error probabilities are reported with baseline context, e.g., 'achieving its best performance of 16.3% error at k=5, compared to 17.0% for Top-k selection' (Section 6.1). Tables provide absolute values enabling comparison."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No justification for dataset sizes: 4183 MEDMCQA samples, 5000 randomly selected MMLU samples, 10000 IMDB samples. No power analysis or discussion of whether these sizes are adequate for the claimed effect sizes."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Standard deviation is reported across 30 evaluations (3 temperatures × 2 runs × 5 random 80/20 splits) in all summary tables. E.g., Table 4: '0.163 ± 0.008'."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Three baselines are compared: Top-k (accuracy-based selection), Term 1 Only (relevance only), and Terms 1+2 (mRMR-style). Additionally, three aggregation rules are compared (MAP, MV, W-MV)."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The baselines represent the principal selection strategies in the literature. The mRMR baseline derives from Peng et al. (2005), the dominant paradigm. LLM-TOPLA (Tekin et al., 2024) and MUSE (Kruse et al., 2025) are discussed as related work."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The decomposition into Term 1 (accuracy), Terms 1+2 (mRMR), and full Greedy MI (all three terms) constitutes an ablation, testing the contribution of each component of the information gain decomposition (Theorem 4.3)."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "Only test error probability is reported. No other metrics (e.g., computational efficiency of selection, calibration, coverage, or area under budget-performance curve) are used."
     85       },
     86       "human_evaluation": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "Human evaluation is irrelevant to this paper's claims about ensemble selection algorithms for binary classification tasks."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The paper uses an 80/20 random split: '80% for estimating the required information-theoretic quantities... 20% for evaluation' (Section 6.1). This is repeated across 5 independent random splits."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Results are broken down by dataset (MEDMCQA, MMLU, IMDB), by temperature setting (0.01, 0.3, 0.7), by run, and by ensemble size k. Per-condition curves appear in Appendices F-H."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Section 6.3 discusses IMDB where 'improvements from ensemble selection are modest' due to high correlation (ρ̄=0.90). Performance saturation at large k is discussed. Section 8 acknowledges binary setting limitation."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Several negative results: IMDB shows minimal gains; at large k all methods degrade; Appendix F.2 notes 'baselines with WE aggregation occasionally outperform Greedy MI with MAP for large ensemble sizes'; the saturation theorem itself is a negative result."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Abstract claims 'consistently outperforms strong baselines under the same query budget' — results in Figures 5, 6 and Tables 4, 8, 12 show this for MEDMCQA and MMLU in mid-range k. IMDB gains are modest but present. The saturation floor claim is supported by Theorem 4.4."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The paper makes causal claims ('correlation introduces additional structure,' 'mRMR's aggressive diversity-seeking has forced it to include several weak models, degrading overall performance'). These are justified through formal theorems (4.1, 4.3, 4.4) and the ablation structure (Terms 1 vs 1+2 vs full)."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Section 8 explicitly bounds scope: 'Our study focuses on a binary decision setting' and acknowledges 'extending these insights to richer output spaces and alternative dependency structures presents a promising direction.' The title and abstract don't overclaim beyond what's shown."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Section 8 discusses alternative explanations: MAP estimation difficulty at large k, Gaussian-copula model limitations, the role of training dataset size for MI estimation. Appendix F.2 discusses aggregation rule interaction effects."
    132       },
    133       "proxy_outcome_distinction": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "The paper measures test error probability and claims this measures ensemble classification accuracy. The measurement matches the claim directly — no proxy gap exists. The binary conversion procedure is transparent."
    137       }
    138     },
    139     "setup_transparency": {
    140       "model_versions_specified": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "Models are listed by marketing names only: 'GPT-4.1', 'GPT-5-chat', 'Gemini-2.5-flash', 'Claude-3.5-haiku', etc. No API versions, snapshot dates, or specific model IDs are provided. These are insufficient per the criterion."
    144       },
    145       "prompts_provided": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Full prompt templates are provided in Appendices F, G, H with exact text. E.g., MEDMCQA: 'System: Is the following statement true or false? Answer with a single word: True or False.' With concrete examples including fill values."
    149       },
    150       "hyperparameters_reported": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Temperature settings explicitly stated: 0.01, 0.3, and 0.7 (Section 6.1). Laplace smoothing α=1 documented in Algorithm 2 (Appendix E.1). Two independent runs per temperature setting."
    154       },
    155       "scaffolding_described": {
    156         "applies": false,
    157         "answer": false,
    158         "justification": "No agentic scaffolding is used. The paper sends single prompts to LLM APIs and collects binary predictions."
    159       },
    160       "data_preprocessing_documented": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Binary conversion procedure is documented in detail (Section 6.1): positive queries append ground-truth answer, negative queries append a randomly selected incorrect answer. MMLU uses 5000 randomly selected samples from 14042. IMDB sampling described in Appendix H."
    164       }
    165     },
    166     "limitations_and_scope": {
    167       "limitations_section_present": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 8 'Limitations and Discussion' provides substantive discussion of the binary setting limitation, Gaussian-copula model assumptions, and saturation effects."
    171       },
    172       "threats_to_validity_specific": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 8 identifies specific threats: binary decision setting may not extend to richer output spaces, Gaussian-copula may not capture all dependency structures, and saturation effects limit improvements. These are specific to this study."
    176       },
    177       "scope_boundaries_stated": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 8 states: 'Our study focuses on a binary decision setting, which allows for a clean and interpretable information-theoretic analysis and serves as a foundational step toward more general formulations.' Explicitly scopes to binary classification."
    181       }
    182     },
    183     "data_integrity": {
    184       "raw_data_available": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "The raw model outputs (binary predictions from 12-13 LLMs on thousands of examples) are not released. Only aggregated results are shown. Independent verification of the correlation matrices and error patterns is not possible."
    188       },
    189       "data_collection_described": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Data collection is described: models queried through OpenRouter (Appendix F), specific datasets and splits identified, temperature settings specified, binary conversion procedure documented."
    193       },
    194       "recruitment_methods_described": {
    195         "applies": false,
    196         "answer": false,
    197         "justification": "No human participants. Data sources are standard public benchmarks (MEDMCQA, MMLU, IMDB)."
    198       },
    199       "data_pipeline_documented": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Pipeline documented: original multi-choice questions → binary conversion (2 queries per question) → LLM inference at specified temperatures → 80/20 random splits × 5 folds → MI estimation on train → evaluation on test."
    203       }
    204     },
    205     "conflicts_of_interest": {
    206       "funding_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Funding disclosed on page 1: 'This work was supported by Tubitak 2232-B program (Project No:124C533).'"
    210       },
    211       "affiliations_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Author affiliations clearly listed: Bilkent University (Turkmen, Bastopcu) and University of Birmingham (Buyukates). No affiliation with any evaluated model provider."
    215       },
    216       "funder_independent_of_outcome": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Tubitak is the Scientific and Technological Research Council of Turkey, a government research funding agency with no financial interest in which ensemble selection method performs best."
    220       },
    221       "financial_interests_declared": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No competing interests or financial interests statement is included in the paper."
    225       }
    226     },
    227     "contamination": {
    228       "training_cutoff_stated": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No training data cutoff dates are stated for any of the 12-13 LLMs evaluated. MEDMCQA (2022) and MMLU (2021) are old benchmarks likely in most models' training data."
    232       },
    233       "train_test_overlap_discussed": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No discussion of whether MEDMCQA or MMLU examples appeared in the training data of GPT-4.1, GPT-5-chat, Qwen3-235b, or other evaluated models. This could affect the error correlation structure central to the paper's claims."
    237       },
    238       "benchmark_contamination_addressed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "MEDMCQA (2022) and MMLU (2021) were published well before the training of the 2025-2026 models used. No contamination analysis is performed despite this being a significant concern for the validity of the observed correlation structures."
    242       }
    243     },
    244     "human_studies": {
    245       "pre_registered": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants in this study."
    249       },
    250       "irb_or_ethics_approval": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "demographics_reported": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "inclusion_exclusion_criteria": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "randomization_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "blinding_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "attrition_reported": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       }
    280     },
    281     "cost_and_practicality": {
    282       "inference_cost_reported": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "No API costs, token counts, or latency figures reported. The paper queries 12-13 LLMs via OpenRouter across thousands of examples at three temperatures with two runs each, but total cost is not stated."
    286       },
    287       "compute_budget_stated": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "No total computational budget (API spend, wall-clock time, or hardware) is reported despite extensive experimentation across multiple models, temperatures, and datasets."
    291       }
    292     },
    293     "experimental_rigor": {
    294       "seed_sensitivity_reported": {
    295         "applies": true,
    296         "answer": true,
    297         "justification": "Results are reported across 2 independent runs per temperature and 5 random data splits. Standard deviations capture variability. Tables report 'averaged over 30 evaluations (3 temperatures × 2 runs × 5 folds).'"
    298       },
    299       "number_of_runs_stated": {
    300         "applies": true,
    301         "answer": true,
    302         "justification": "Explicitly stated: '3 temperature settings (0.01, 0.3, and 0.7), performing two independent runs at each temperature' (Section 6.1) and '5 independent random splits' per run."
    303       },
    304       "hyperparameter_search_budget": {
    305         "applies": false,
    306         "answer": false,
    307         "justification": "The Greedy MI method is essentially parameter-free (greedy selection with standard Laplace smoothing α=1). Temperatures are experimental conditions, not hyperparameters of the selection method."
    308       },
    309       "best_config_selection_justified": {
    310         "applies": true,
    311         "answer": true,
    312         "justification": "Results are reported across all configurations (all k values, temperatures, runs, splits). No cherry-picking of best configuration — Figures 5-6 show full curves, Tables 4-14 show all settings."
    313       },
    314       "multiple_comparison_correction": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "Four methods are compared across up to 13 ensemble sizes, 3 temperatures, 2 runs, and 3 aggregation rules. No multiple comparison correction (Bonferroni, Holm, etc.) is applied."
    318       },
    319       "self_comparison_bias_addressed": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "The authors implement all baselines (Top-k, Term 1, mRMR) themselves and compare against their own method. No acknowledgment of author-evaluation bias or independent evaluation."
    323       },
    324       "compute_budget_vs_performance": {
    325         "applies": true,
    326         "answer": true,
    327         "justification": "The entire paper is structured around comparing methods at matched query budgets (k). Figures 5, 6 plot performance as a function of ensemble size k, which directly corresponds to compute cost."
    328       },
    329       "benchmark_construct_validity": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "No discussion of whether MEDMCQA, MMLU, and IMDB actually measure the capabilities relevant to ensemble selection claims. The artificial binary conversion (appending answers and asking true/false) creates a different task than the original benchmarks — this validity concern is not addressed."
    333       },
    334       "scaffold_confound_addressed": {
    335         "applies": false,
    336         "answer": false,
    337         "justification": "No scaffolding is involved. All models receive the same simple prompt with no agentic components."
    338       }
    339     },
    340     "data_leakage": {
    341       "temporal_leakage_addressed": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "MEDMCQA (2022) and MMLU (2021) were published years before the 2025-2026 models used. Models very likely trained on these benchmarks. No temporal leakage analysis is performed."
    345       },
    346       "feature_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "The binary conversion format (a true/false judgment on a statement formed by appending a candidate answer to the question) could systematically differ from how models encountered these questions in training. This potential feature leakage through format transformation is not discussed."
    350       },
    351       "non_independence_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "Each multi-choice question generates one positive and one negative query that are structurally dependent (same question stem, correct vs incorrect answer). This non-independence in the evaluation set is not addressed."
    355       },
    356       "leakage_detection_method": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No leakage detection or prevention methods are used (no canary strings, membership inference, temporal splits, or decontamination)."
    360       }
    361     }
    362   },
    363   "claims": [
    364     {
    365       "claim": "When LLM decisions are independent, the optimal ensemble selection is trivially selecting the most accurate models (Top-k).",
    366       "evidence": "Theorem 4.1 with formal proof in Appendix A, using binary symmetric channel (BSC) framework and stochastic degradation argument.",
    367       "supported": "strong"
    368     },
    369     {
    370       "claim": "Under uniform pairwise correlation, the error probability converges to a non-zero floor even with infinite models, given by Φ(Φ⁻¹(1−α)/√ρ).",
    371       "evidence": "Theorem 4.4 with formal proof in Appendix C using Gaussian-copula one-factor model and strong law of large numbers.",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "The marginal information gain decomposes into accuracy, prediction redundancy, error correlation, and a label-dependence correction term, showing mRMR misses the error correlation component.",
    376       "evidence": "Theorem 4.3 with formal proof in Appendix B, demonstrating that standard mRMR criteria omit the I(Ej; ES) term.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Greedy MI consistently outperforms strong baselines under identical query budgets across multiple datasets.",
    381       "evidence": "Figures 5-6, Tables 4, 8, 12. MEDMCQA: 16.3% vs 17.0% at k=5; MMLU: 14.1% vs 14.9% at k=6. Averaged over 30 evaluations with std dev.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "The Gaussian-copula model effectively captures both pairwise and higher-order error dependencies of real LLM ensembles.",
    386       "evidence": "Figures 4, 10-11, 15-16, 20-21 show visual agreement between copula-predicted and empirical error distributions. Validation across all temperature/run settings.",
    387       "supported": "moderate"
    388     }
    389   ],
    390   "methodology_tags": ["theoretical", "benchmark-eval"],
    391   "key_findings": "The paper provides an information-theoretic framework for LLM ensemble selection, proving that Top-k accuracy selection is optimal only under independent errors (Theorem 4.1) and that correlated errors create an irreducible error floor (Theorem 4.4). The Greedy MI algorithm, motivated by a novel accuracy-redundancy-error decomposition (Theorem 4.3), achieves modest but consistent improvements over baselines in the practical mid-budget range (k=3-7) on MEDMCQA (~0.7pp) and MMLU (~0.8pp), with negligible gains on IMDB where correlations are very high (ρ̄=0.90).",
    392   "red_flags": [
    393     {
    394       "flag": "No significance testing",
    395       "detail": "Claims of 'consistently outperforms' are based on raw number comparisons. Improvements are small (0.7-0.8 percentage points) and within the reported standard deviations. No statistical significance tests are performed to verify these differences are meaningful."
    396     },
    397     {
    398       "flag": "Benchmark contamination unaddressed",
    399       "detail": "MEDMCQA (2022) and MMLU (2021) are likely in the training data of 2025-2026 models (GPT-5-chat, GPT-4.1, Qwen3-235b, etc.). Contamination could systematically alter the error correlation structure that is central to the paper's theoretical and empirical claims."
    400     },
    401     {
    402       "flag": "Artificial binary conversion",
    403       "detail": "Multi-choice questions are converted to binary true/false by appending candidate answers. This creates non-independent positive/negative query pairs from the same question and transforms the task into something different from what the benchmarks were designed to measure. The validity of this conversion is not assessed."
    404     },
    405     {
    406       "flag": "Modest effect sizes",
    407       "detail": "Improvements over Top-k are ~0.7pp on MEDMCQA and ~0.8pp on MMLU. On IMDB, gains are negligible. These improvements, while consistent in direction, are small relative to the standard deviations reported and may not be practically significant."
    408     }
    409   ],
    410   "cited_papers": [
    411     {
    412       "title": "Why do multi-agent LLM systems fail?",
    413       "authors": ["Cemri, M.", "Pan, M. Z.", "Yang, S."],
    414       "year": 2025,
    415       "arxiv_id": "2503.13657",
    416       "relevance": "Systematic failure analysis of multi-agent LLM systems, identifies inter-agent misalignment as dominant failure mode — directly relevant to understanding correlated LLM errors."
    417     },
    418     {
    419       "title": "Towards a science of scaling agent systems",
    420       "authors": ["Kim, Y.", "Gu, K.", "Park, C."],
    421       "year": 2025,
    422       "arxiv_id": "2512.08296",
    423       "relevance": "Studies scaling laws for multi-agent systems, observes diminishing returns from coordination when single-agent accuracy exceeds ~45% — supports the saturation phenomenon studied here."
    424     },
    425     {
    426       "title": "Harnessing multiple large language models: A survey on LLM ensemble",
    427       "authors": ["Chen, Z.", "Li, J.", "Chen, P."],
    428       "year": 2025,
    429       "arxiv_id": "2502.18036",
    430       "relevance": "Survey of LLM ensemble methods covering voting, fusion, and selection strategies — comprehensive overview of the field this paper contributes to."
    431     },
    432     {
    433       "title": "Self-consistency improves chain of thought reasoning in language models",
    434       "authors": ["Wang, X.", "Wei, J.", "Schuurmans, D."],
    435       "year": 2023,
    436       "arxiv_id": "2203.11171",
    437       "relevance": "Foundational work on self-consistency through majority voting for single-model sampling, extended to multi-model settings by subsequent work."
    438     },
    439     {
    440       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    441       "authors": ["Chen, L.", "Zaharia, M.", "Zou, J."],
    442       "year": 2024,
    443       "relevance": "Addresses cost-effective LLM usage through cascaded selection and model routing — related to the budgeted ensemble selection problem."
    444     },
    445     {
    446       "title": "LLM-Blender: ensembling large language models with pairwise comparison and generative fusion",
    447       "authors": ["Jiang, D.", "Ren, X.", "Lin, B. Y."],
    448       "year": 2023,
    449       "relevance": "LLM ensemble method using pairwise ranking and generative fusion for combining outputs from multiple LLMs."
    450     },
    451     {
    452       "title": "Beyond majority voting: LLM aggregation by leveraging higher-order information",
    453       "authors": ["Ai, R.", "Pan, Y.", "Simchi-Levi, D."],
    454       "year": 2025,
    455       "arxiv_id": "2510.01499",
    456       "relevance": "Uses second-order statistics for optimal weight aggregation in LLM ensembles — complementary approach to the selection problem addressed here."
    457     },
    458     {
    459       "title": "LLM-TOPLA: Efficient LLM ensemble by maximising diversity",
    460       "authors": ["Tekin, S. F.", "Ilhan, F.", "Huang, T."],
    461       "year": 2024,
    462       "relevance": "Introduces focal diversity metrics for LLM ensemble pruning — alternative diversity-based approach to ensemble selection."
    463     },
    464     {
    465       "title": "Simple yet effective: An information-theoretic approach to multi-LLM uncertainty quantification",
    466       "authors": ["Kruse, M.", "Afshar, M.", "Khatwani, S."],
    467       "year": 2025,
    468       "relevance": "Applies Jensen-Shannon divergence for selecting well-calibrated LLM subsets — information-theoretic approach to LLM ensemble uncertainty."
    469     },
    470     {
    471       "title": "Cost-aware LLM-based online dataset annotation",
    472       "authors": ["Elumar, E. C.", "Tekin, C.", "Yagan, O."],
    473       "year": 2025,
    474       "relevance": "Uses Gaussian-copula sampler to model dependent correctness for Monte Carlo confidence estimation in LLM annotation — directly related copula modeling approach."
    475     },
    476     {
    477       "title": "Large language models: A survey",
    478       "authors": ["Minaee, S.", "Mikolov, T.", "Nikzad, N."],
    479       "year": 2025,
    480       "arxiv_id": "2402.06196",
    481       "relevance": "Comprehensive survey of LLM capabilities and applications, providing context for why LLM ensembling is needed."
    482     }
    483   ]
    484 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs