scan.json (23839B)
1 { 2 "paper": { 3 "title": "Power and Limitations of Aggregation in Compound AI Systems", 4 "authors": [ 5 "Nivasini Ananthakrishnan", 6 "Meena Jagadeesan" 7 ], 8 "year": 2026, 9 "venue": "arXiv", 10 "arxiv_id": "2602.21556" 11 }, 12 "scan_version": 2, 13 "active_modules": [], 14 "methodology_tags": ["theoretical", "case-study"], 15 "key_findings": "This paper provides a theoretical framework characterizing when aggregation in compound AI systems expands the set of elicitable outputs beyond what a single model can produce. Three mechanisms are identified (feasibility expansion, support expansion, and binding set contraction), and any elicitability-expanding aggregation must implement at least one of them (Theorem 3.7). Strengthened versions of these mechanisms are proven both necessary and sufficient for elicitability-expansion (Theorems 4.3, 4.4). Empirical illustration using GPT-4o-mini on a toy reference-generation task demonstrates all three mechanisms, though the feasibility expansion effect is very small (95% CI for the ℓ1 gap: [0.03, 0.15]).", 16 "checklist": { 17 "artifacts": { 18 "code_released": { 19 "applies": true, 20 "answer": true, 21 "justification": "Code is released at https://github.com/nivasini/aggregation_compound_ai, referenced in footnotes on pages 7 and 16." 22 }, 23 "data_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The code repository enables reproduction of all experimental data. The only external dataset used (AlpacaEval helpful-base) is publicly available. Experimental outputs are generated from the released code." 27 }, 28 "environment_specified": { 29 "applies": true, 30 "answer": false, 31 "justification": "The paper specifies the model (gpt-4o-mini-2024-07-18) and embedding model (all-mpnet-base-v2) but provides no requirements.txt, Dockerfile, or detailed dependency specifications." 
32 }, 33 "reproduction_instructions": { 34 "applies": true, 35 "answer": false, 36 "justification": "Appendices E and F describe the experimental methodology in detail, but the paper does not provide step-by-step reproduction instructions (no README-style commands or scripts to run)." 37 } 38 }, 39 "statistical_methodology": { 40 "confidence_intervals_or_error_bars": { 41 "applies": true, 42 "answer": true, 43 "justification": "Table 2 reports 95% Wilson confidence intervals for all output vector coordinates and ℓ1 distances. Figure 3 shows shaded confidence sets. Appendix F describes the confidence interval methodology." 44 }, 45 "significance_tests": { 46 "applies": true, 47 "answer": true, 48 "justification": "The paper uses 95% confidence intervals to assess whether ℓ1 distances between aggregated and best single-model outputs are non-zero. For feasibility expansion, they note the CI [0.03, 0.15] 'notably does not contain 0' (Section 5.1), which is functionally equivalent to a significance test." 49 }, 50 "effect_sizes_reported": { 51 "applies": true, 52 "answer": true, 53 "justification": "Table 2d reports ℓ1 distances with confidence intervals: support expansion [0.58, 0.94], feasibility expansion [0.03, 0.15], binding set contraction [0.21, 0.35]. These provide the magnitude of the elicitability-expansion effect." 54 }, 55 "sample_size_justified": { 56 "applies": true, 57 "answer": false, 58 "justification": "The paper states 30 trials were used (Appendix F) but provides no justification for this choice and no power analysis." 59 }, 60 "variance_reported": { 61 "applies": true, 62 "answer": true, 63 "justification": "95% confidence intervals are reported for all quantities in Table 2, encoding variance information. The paper explicitly describes averaging over 30 trials." 
64 } 65 }, 66 "evaluation_design": { 67 "baselines_included": { 68 "applies": true, 69 "answer": true, 70 "justification": "The paper compares aggregated outputs x(A) against the best achievable single-model output x∗_P(x(A)), found via brute-force search over all prompt specifications (Section 5). This is the natural baseline." 71 }, 72 "baselines_contemporary": { 73 "applies": true, 74 "answer": true, 75 "justification": "The baseline is GPT-4o-mini without aggregation — the same contemporary model used for the aggregated system." 76 }, 77 "ablation_study": { 78 "applies": false, 79 "answer": false, 80 "justification": "The system being studied (aggregation of prompts) has essentially one component — the aggregation rule. The paper tests different aggregation operations separately, but there are no multi-component systems to ablate." 81 }, 82 "multiple_metrics": { 83 "applies": true, 84 "answer": false, 85 "justification": "The primary evaluation metric is the ℓ1 distance between x(A) and x∗_P(x(A)). Coordinate-wise output vectors are reported but serve as descriptive measurements, not a separate evaluation metric." 86 }, 87 "human_evaluation": { 88 "applies": true, 89 "answer": false, 90 "justification": "No human evaluation is included. The paper relies entirely on LLM-as-a-judge (GPT-4o-mini) to classify paper topics. Human validation of topic classifications could have strengthened the empirical claims." 91 }, 92 "held_out_test_set": { 93 "applies": false, 94 "answer": false, 95 "justification": "No train/test paradigm exists. The paper evaluates LLM prompting behavior, not supervised model performance. The brute-force prompt search is part of the evaluation methodology, not tuning." 
96 }, 97 "per_category_breakdown": { 98 "applies": true, 99 "answer": true, 100 "justification": "Results are broken down by all three mechanisms (support expansion, binding set contraction, feasibility expansion) in Table 2 and Figure 3, with per-dimension output vectors for each." 101 }, 102 "failure_cases_discussed": { 103 "applies": true, 104 "answer": true, 105 "justification": "The paper notes that feasibility expansion evidence is 'slightly weaker than for the other two mechanisms' with a very small gap (Section 5.1). Table 1 explicitly shows which mechanisms each aggregation rule cannot implement." 106 }, 107 "negative_results_reported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Table 1 shows fundamental limitations: intersection aggregation cannot implement support expansion (Proposition A.1), addition cannot implement feasibility expansion (Proposition A.2). Propositions A.4 and A.5 show that the natural mechanisms are not sufficient for elicitability-expansion." 111 } 112 }, 113 "claims_and_evidence": { 114 "abstract_claims_supported": { 115 "applies": true, 116 "answer": true, 117 "justification": "All abstract claims are supported: three mechanisms are formalized (Section 3), necessity is proven (Theorem 3.7), full characterization via strengthened mechanisms is proven (Theorems 4.3, 4.4), and empirical illustration is provided (Section 5)." 118 }, 119 "causal_claims_justified": { 120 "applies": true, 121 "answer": true, 122 "justification": "The theoretical claims (aggregation 'expands' elicitability, 'overcomes' limitations) are formally proven within the principal-agent model. The empirical claims are hedged as 'results suggest' and 'empirical illustration' (Section 5.1)." 123 }, 124 "generalization_bounded": { 125 "applies": true, 126 "answer": true, 127 "justification": "The paper repeatedly describes the framework as 'stylized' (Section 1). 
Section 6 explicitly lists what the model does NOT capture: stochasticity, nonlinear constraints, multi-turn interactions, tool use, fine-tuning. The abstract says 'takes a step towards characterizing' rather than claiming full characterization of real systems." 128 }, 129 "alternative_explanations_discussed": { 130 "applies": true, 131 "answer": true, 132 "justification": "Section 6 discusses model limitations and extensions including randomness, nonlinear limitations, and multi-turn interactions. For the empirical results, the paper shows that using finer-grained prompt topics (output dimension topics vs. prompt topics) can achieve similar effects without aggregation (Section 5.1)." 133 }, 134 "proxy_outcome_distinction": { 135 "applies": true, 136 "answer": true, 137 "justification": "The paper clearly distinguishes between its theoretical notion of elicitability-expansion (Definition 2.4) and the empirical operationalization based on ℓ1 distance closeness. Section 5 states 'we relax these concepts to allow for closeness in output vectors rather than necessarily requiring an exact match.'" 138 } 139 }, 140 "setup_transparency": { 141 "model_versions_specified": { 142 "applies": true, 143 "answer": true, 144 "justification": "The exact model version 'gpt-4o-mini-2024-07-18' is specified in Appendices E and F. The embedding model 'all-mpnet-base-v2' is also specified (Appendix E)." 145 }, 146 "prompts_provided": { 147 "applies": true, 148 "answer": true, 149 "justification": "Full prompt texts are provided in Appendices E and F: five perspective prompts for reference generation, intersection-style and addition-style aggregation instructions, and the LLM-as-a-judge classification prompt. The Section 5 experiment prompts and prompt template are also given in Appendix F." 
150 }, 151 "hyperparameters_reported": { 152 "applies": true, 153 "answer": true, 154 "justification": "Temperature is reported: 1.0 for the Section 2.4 embedding experiments (Appendix E), 0 for all Section 5 experiments (Appendix F)." 155 }, 156 "scaffolding_described": { 157 "applies": false, 158 "answer": false, 159 "justification": "No agentic scaffolding is used. The LLM is directly prompted with single-turn queries." 160 }, 161 "data_preprocessing_documented": { 162 "applies": true, 163 "answer": true, 164 "justification": "Appendix F documents preprocessing: title reformatting (lowercase, remove punctuation, remove whitespace), deduplication via substring matching, LLM-as-judge classification procedure, and Wilson confidence interval computation. Appendix E describes the embedding shift to the nonnegative orthant." 165 } 166 }, 167 "limitations_and_scope": { 168 "limitations_section_present": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 6 (Discussion) contains a substantive 'Model limitations and extensions' subsection discussing multiple specific limitations of the framework." 172 }, 173 "threats_to_validity_specific": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 6 discusses specific limitations: deterministic agent assumption (real LLMs are stochastic), linear constraints and features (real systems may have nonlinear limitations), single-round interaction (debate protocols use multi-turn), and agent reward independence (real interactions have interdependencies)." 177 }, 178 "scope_boundaries_stated": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 6 explicitly states what the results do NOT show: 'we assume for simplicity that the agent chooses an output deterministically,' 'we restrict the output constraints and the feature map to linear functional forms,' 'we also assume each agent's reward depends only on its own outputs.' 
Extensions needed for nonlinear limitations and multi-turn interactions are acknowledged." 182 } 183 }, 184 "data_integrity": { 185 "raw_data_available": { 186 "applies": true, 187 "answer": false, 188 "justification": "The code to regenerate experiments is released on GitHub, but the actual raw LLM outputs from the 30 trials are not explicitly stated to be available. LLM outputs may differ over time even with the same code." 189 }, 190 "data_collection_described": { 191 "applies": true, 192 "answer": true, 193 "justification": "The data collection procedure is well described: GPT-4o-mini is queried with specific prompts, outputs are classified by LLM-as-judge, and output vectors are computed as topic fractions and averaged over 30 trials (Section 5; Appendices E and F)." 194 }, 195 "recruitment_methods_described": { 196 "applies": false, 197 "answer": false, 198 "justification": "No human participants. Data is generated from LLM queries using a standard public dataset (AlpacaEval) for embedding calibration." 199 }, 200 "data_pipeline_documented": { 201 "applies": true, 202 "answer": true, 203 "justification": "The full pipeline is documented in Appendices E and F: prompting → LLM output → title reformatting/deduplication → aggregation → LLM-as-judge classification → output vector computation → confidence interval calculation." 204 } 205 }, 206 "conflicts_of_interest": { 207 "funding_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "Section 7 (Acknowledgments) states: 'This work was partially supported by a Stanford AI Lab postdoctoral fellowship.'" 211 }, 212 "affiliations_disclosed": { 213 "applies": true, 214 "answer": true, 215 "justification": "Author affiliations are listed: Nivasini Ananthakrishnan at UC Berkeley, Meena Jagadeesan at Stanford University. Neither author is affiliated with OpenAI (the evaluated model provider)." 
216 }, 217 "funder_independent_of_outcome": { 218 "applies": true, 219 "answer": true, 220 "justification": "The Stanford AI Lab postdoctoral fellowship has no financial interest in the outcomes of this study. The paper evaluates OpenAI's GPT-4o-mini, and the funder (Stanford) is independent." 221 }, 222 "financial_interests_declared": { 223 "applies": true, 224 "answer": false, 225 "justification": "No competing interests or financial interest disclosure statement is present in the paper." 226 } 227 }, 228 "contamination": { 229 "training_cutoff_stated": { 230 "applies": false, 231 "answer": false, 232 "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. It studies the structural properties of prompting and aggregation in a toy illustration task — not measuring model knowledge." 233 }, 234 "train_test_overlap_discussed": { 235 "applies": false, 236 "answer": false, 237 "justification": "The paper does not evaluate a pre-trained model on a benchmark. The reference-generation task studies prompting/aggregation behavior, not whether the model can solve benchmark problems." 238 }, 239 "benchmark_contamination_addressed": { 240 "applies": false, 241 "answer": false, 242 "justification": "No benchmark evaluation is conducted. The empirical part is a toy illustration of theoretical mechanisms, not a measurement of model capability on a contamination-sensitive task." 243 } 244 }, 245 "human_studies": { 246 "pre_registered": { 247 "applies": false, 248 "answer": false, 249 "justification": "No human participants in this study." 250 }, 251 "irb_or_ethics_approval": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in this study." 255 }, 256 "demographics_reported": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants in this study." 
260 }, 261 "inclusion_exclusion_criteria": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in this study." 265 }, 266 "randomization_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in this study." 270 }, 271 "blinding_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in this study." 275 }, 276 "attrition_reported": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants in this study." 280 } 281 }, 282 "cost_and_practicality": { 283 "inference_cost_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "Primarily a theoretical paper. The empirical illustration is a toy task; cost reporting is not relevant to the theoretical contributions." 287 }, 288 "compute_budget_stated": { 289 "applies": false, 290 "answer": false, 291 "justification": "Primarily a theoretical paper. The empirical component is a small-scale illustration, not a computationally intensive experiment." 
292 } 293 } 294 }, 295 "claims": [ 296 { 297 "claim": "Any aggregation operation must implement at least one of three mechanisms (feasibility expansion, support expansion, or binding set contraction) to expand the set of elicitable outputs.", 298 "evidence": "Theorem 3.7 (Section 3.2) with formal proof in Section C.1, building on Theorem 4.4.", 299 "supported": "strong" 300 }, 301 { 302 "claim": "Strengthened versions of the three mechanisms provide necessary and sufficient conditions that fully characterize elicitability-expansion.", 303 "evidence": "Theorem 4.3 (sufficiency, proof in Section D.6) and Theorem 4.4 (necessity, proof in Section D.5) establish the full characterization via the power-characterizing condition (Definition 4.2).", 304 "supported": "strong" 305 }, 306 { 307 "claim": "Intersection aggregation cannot implement support expansion for any problem instance, and addition aggregation cannot implement feasibility expansion for any problem instance.", 308 "evidence": "Proposition A.1 and Proposition A.2 (Section A.1, Table 1) provide formal proofs of these impossibility results.", 309 "supported": "strong" 310 }, 311 { 312 "claim": "LLMs (GPT-4o-mini) in a toy reference-generation task empirically demonstrate all three mechanisms, and each mechanism expands elicitability.", 313 "evidence": "Section 5, Figure 3, Table 2. Support expansion: ℓ1 distance [0.58, 0.94]. Binding set contraction: ℓ1 distance [0.21, 0.35]. Feasibility expansion: ℓ1 distance [0.03, 0.15]. 
All 95% CIs exclude 0.", 314 "supported": "moderate" 315 }, 316 { 317 "claim": "The three natural mechanisms (non-strengthened) are not sufficient for elicitability-expansion.", 318 "evidence": "Propositions A.4 and A.5 (Section A.2) construct counterexamples where support expansion and binding set contraction are implemented but aggregation is not elicitability-expanding.", 319 "supported": "strong" 320 } 321 ], 322 "red_flags": [ 323 { 324 "flag": "Toy empirical illustration", 325 "detail": "The empirical section uses a single model (GPT-4o-mini) on a contrived reference-generation task with hand-picked topics designed to demonstrate each mechanism. The gap between this stylized illustration and real compound AI systems is substantial. The paper acknowledges this but the empirical evidence for the theoretical framework's practical relevance remains thin." 326 }, 327 { 328 "flag": "LLM-as-judge self-evaluation", 329 "detail": "GPT-4o-mini classifies the topics of reference lists that it generated itself. This introduces potential systematic bias in the output vector measurements, since the model's own biases in generation and classification may be correlated." 330 }, 331 { 332 "flag": "Weak feasibility expansion evidence", 333 "detail": "The feasibility expansion experiment has a very small ℓ1 gap ([0.03, 0.15]), which the paper acknowledges is 'slightly weaker' evidence. The confidence interval barely excludes 0, raising questions about whether this mechanism is practically meaningful for LLMs." 334 } 335 ], 336 "cited_papers": [ 337 { 338 "title": "Improving factuality and reasoning in language models through multiagent debate", 339 "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba", "Joshua B. Tenenbaum", "Igor Mordatch"], 340 "year": 2024, 341 "relevance": "Multi-agent LLM debate protocol for improving factuality, directly motivates the aggregation framework studied in this paper." 
342 }, 343 { 344 "title": "How we built our multi-agent research system", 345 "authors": ["Anthropic"], 346 "year": 2025, 347 "relevance": "Multi-agent research system where a lead LLM delegates to specialized agents and aggregates outputs — a key motivating application." 348 }, 349 { 350 "title": "Ask me anything: A simple strategy for prompting language models", 351 "authors": ["Simran Arora", "Avanika Narayan", "Mayee F. Chen"], 352 "year": 2022, 353 "arxiv_id": "2210.02441", 354 "relevance": "Prompt ensembling approach that replaces complex single-model prompting with diverse multi-model strategies, directly relevant to aggregation benefits." 355 }, 356 { 357 "title": "Debating with more persuasive LLMs leads to more truthful answers", 358 "authors": ["Akbir Khan", "John Hughes", "Dan Valentine"], 359 "year": 2024, 360 "relevance": "LLM debate where different agents argue for different answers, a form of aggregation studied in this paper." 361 }, 362 { 363 "title": "Self-consistency improves chain of thought reasoning in language models", 364 "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"], 365 "year": 2023, 366 "relevance": "Self-consistency as a form of output aggregation across multiple reasoning traces from the same model." 367 }, 368 { 369 "title": "Adversaries can misuse combinations of safe models", 370 "authors": ["Erik Jones", "Anca Dragan", "Jacob Steinhardt"], 371 "year": 2025, 372 "relevance": "Adversarial model combination to generate unsafe outcomes — explores when aggregation enables capabilities not available from individual models." 373 }, 374 { 375 "title": "Deep reinforcement learning from human preferences", 376 "authors": ["Paul F. Christiano", "Jan Leike", "Tom Brown"], 377 "year": 2017, 378 "relevance": "RLHF framework for training models with human feedback, related to reward function specification in the principal-agent model." 
379 }, 380 { 381 "title": "The consensus game: Language model generation via equilibrium search", 382 "authors": ["Athul Paul Jacob", "Yikang Shen", "Gabriele Farina", "Jacob Andreas"], 383 "year": 2023, 384 "arxiv_id": "2310.09139", 385 "relevance": "Consensus between generators and discriminators as a form of multi-agent LLM aggregation." 386 }, 387 { 388 "title": "Training language models to follow instructions with human feedback", 389 "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"], 390 "year": 2022, 391 "relevance": "InstructGPT and RLHF alignment, cited for empirical observation of alignment-hedging side effects relevant to feasibility expansion mechanism." 392 }, 393 { 394 "title": "Is best-of-n the best of them? Coverage, scaling, and optimality in inference-time alignment", 395 "authors": ["Audrey Huang", "Adam Block", "Qinghua Liu"], 396 "year": 2025, 397 "relevance": "Inference-time alignment via best-of-n sampling and coverage properties, directly related to aggregation at test time." 398 }, 399 { 400 "title": "CoT-based synthesizer: Enhancing LLM performance through answer synthesis", 401 "authors": ["Bohan Zhang", "Xiaokang Zhang", "Jing Zhang"], 402 "year": 2025, 403 "relevance": "Answer synthesis as a form of output aggregation across multiple LLM reasoning traces." 404 }, 405 { 406 "title": "How do classifiers induce agents to invest effort strategically?", 407 "authors": ["Jon Kleinberg", "Manish Raghavan"], 408 "year": 2020, 409 "relevance": "The principal-agent framework that this paper extends to multiple agents and aggregation, foundational to the theoretical model." 410 } 411 ] 412 }