ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25192B)


      1 {
      2   "paper": {
      3     "title": "C3PO: Optimized Large Language Model Cascades with Probabilistic Cost Constraints for Reasoning",
      4     "authors": [
      5       "Antonios Valkanas",
      6       "Soumyasundar Pal",
      7       "Pavel Rumiantsev",
      8       "Yingxue Zhang",
      9       "Mark Coates"
     10     ],
     11     "year": 2025,
     12     "venue": "NeurIPS 2025",
     13     "arxiv_id": "2511.07396"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "A GitHub repository is provided: https://github.com/AntonValk/C3PO-LLM (Section 5.1 footnote 2 and Appendix A)."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "All 16 datasets used are publicly available under open-source licenses (Apache 2.0, MIT, CC0), as noted in Section 5.1 footnote 1. The paper uses standard public benchmarks (GSM8K, MATH-500, SVAMP, AQuA, AIME, CommonSenseQA, BIG-Bench-Hard)."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No requirements.txt, Dockerfile, or detailed environment/dependency listing is provided in the paper. The paper mentions using commercial API services (Nebius, OpenAI) but does not specify library versions or a reproducible environment setup."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "While code is released and the methodology is described, the paper does not include step-by-step reproduction instructions, specific commands to run, or a README-style guide to replicate the main experiments."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Figure 2 and other accuracy-vs-cost plots include error bars denoting 90% confidence intervals, as stated in the figure caption: 'Error bars denote 90% confidence interval.'"
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper claims C3PO 'outperforms' baselines across benchmarks but no formal statistical significance tests (p-values, t-tests, etc.) are reported to support these comparative claims."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The paper reports concrete effect sizes with baseline context, e.g., 'C3PO achieves 62.5% accuracy with cost of 0.0019 USD/question, whereas SC using MPM manages to obtain 57% accuracy at a cost of 0.0053 USD/question' (Section 5.4). Figures show accuracy at specific cost points enabling comparison."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Section 4.3 provides a formal PAC-Bayesian analysis (Theorem 2) that relates the number of self-supervision samples NSS to generalization error bounds, with a concrete example showing that NSS=150 gives bounded excess regret. Appendix E provides the minimal detectable change analysis."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "The plots in Figure 2 and the appendix figures show 90% confidence interval error bars across experimental runs. The boxplots in Figures 1, 8, 10, 12 show distributions across datasets with whiskers extending to 90% coverage."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Section 5.3 describes four baselines: FrugalGPT, Mixture of Thoughts (MoT), TREACLE, and ModelSwitch. SC-CoT (self-consistency with the most powerful model) is also included as a reference point."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The baselines are recent: FrugalGPT (2024), MoT (2024), TREACLE (2024), ModelSwitch (2025). These are contemporary methods representing the state of the art in LLM cascading."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper includes several ablation-style analyses: distribution shift experiments (Section 5.4, Fig. 5), cross-family model experiments (Section 5.4, Fig. 4 right), and per-difficulty-level analysis (Section 5.4, Fig. 3). The conformal guarantee verification (300 runs) also serves as an ablation across configurations."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The paper reports both accuracy and inference cost (USD per question) as primary metrics. The cost violation rate is also tracked as a separate metric for evaluating the conformal guarantee."
     85       },
     86       "human_evaluation": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "This is a benchmark evaluation paper comparing cascade algorithms on automated reasoning tasks with ground-truth answers. Human evaluation is not relevant to the claims."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Section 4 explicitly describes partitioning data into DSS (self-supervision) and DCal (calibration) sets, with test evaluation on a disjoint held-out test set: 'we calibrate C3PO using a held-out calibration set and then evaluate it on a disjoint test set' (Section 5.4)."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Results are reported per-dataset across all 16 benchmarks (Figures 7, 9, 11). Additionally, per-difficulty breakdowns are provided for MATH-500 (Figure 3, Figure 14), and per-question-level analysis is in Appendix I."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The case study in Appendix I.1 shows specific failure patterns (LLaMA 1B and 3B failing on a geometry problem). Appendix I.2 includes a 'very bad' category analysis where C3PO fails at higher cost than baselines. Section 5.4 also notes that o3-mini performs worse than GPT-4o-mini on some datasets."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper reports that the conformal cost guarantee 'can lead to overly conservative cascade decisions leading the model to under-utilize the cost budget sometimes' (Section 6, Limitations). The GPT cascade analysis in Section G.1 notes that o3-mini degrades accuracy on some datasets compared to cheaper models."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims (1) state-of-the-art performance across reasoning benchmarks, (2) conformal prediction bounds on cost, (3) generalization guarantees, and (4) label-free operation. All are supported: empirical results in Section 5.4 and Appendix G, theoretical guarantees in Section 4.3, and the self-supervised framework in Section 4."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The paper makes causal claims via ablation-style experiments (e.g., removing components of the cascade, varying budgets, cross-family evaluation). The core claim that C3PO's threshold optimization causes cost savings is supported by controlled comparisons where all methods receive identical prompts, seeds, and data splits (Section 5.4)."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper explicitly bounds its claims to the tested reasoning benchmarks and model families (LLAMA, QWEN, GPT). The title and abstract specify 'reasoning' tasks. Distribution shift experiments (Section 5.4, Fig. 5) test generalization across datasets, but the paper acknowledges that this evaluation is limited to math reasoning."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper does not substantively discuss alternative explanations for its empirical results. For instance, it does not consider whether C3PO's advantage could be partly due to favorable confidence score properties of the specific models tested, or whether the improvements are driven primarily by the conformal calibration vs. the threshold optimization. The limitations section in Section 6 mentions conservative cost usage but does not discuss alternative explanations for the observed results."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Section 5.2 specifies exact model versions: Llama 3.2-1B-Instruct, Llama 3.2-3B-Instruct, Llama 3.3-70B-Instruct, Llama-3.1-405B-Instruct, Qwen2.5-1.5B-Instruct, Qwen2.5-32B-Instruct, Qwen2.5-72B-Instruct, GPT 3.5 Turbo, GPT-4o-mini, and o3-mini."
    139       },
    140       "prompts_provided": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Appendix H provides full prompt templates with actual few-shot examples for both free-form (MATH-500) and multiple-choice (CommonSenseQA) formats, including the complete worked examples used as demonstrations."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Key hyperparameters are reported: 8-shot prompting, 5 CoT samples per model, training set of 100 reasoning problems, grid resolution K < 10 (Section 4.2), alpha values of 0.05 and 0.1 (Section 5.4), 5 budget levels tested."
    149       },
    150       "scaffolding_described": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The cascade scaffolding is described in detail: Section 3 defines the cascade decision rule with confidence thresholds, Section 4 describes the grid search optimization, conformal calibration procedure, and Algorithm 1 provides pseudocode. The paper does not use agentic scaffolding in the autonomous-agent sense but fully describes its multi-model orchestration."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 4 describes the data partitioning into DSS and DCal splits. Section 5.4 specifies that 100 reasoning problems are used for training, with fixed seeds ensuring identical data for all methods. Appendix F documents the cost model preprocessing."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 6 (Conclusion) contains a dedicated 'Limitations' subsection. Appendix A provides an extended 'Broader Impact, Limitations and Code' section with detailed discussion of limitations, risks, and misuse."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The paper identifies specific threats: (1) the conformal cost guarantee can lead to overly conservative decisions (Section 6), (2) cost reduction may be applied without verifying model correctness in critical domains (Appendix A), (3) token pricing changes over time (Appendix F, with a defense that ratios remain consistent)."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "While the paper restricts experiments to reasoning benchmarks and specific model families, it does not explicitly state what the results do NOT show. The abstract claims 'scalable LLM deployment' broadly without bounding this to reasoning tasks. The paper does not explicitly exclude applicability to generation, summarization, or other LLM tasks."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "All datasets used are publicly available standard benchmarks (GSM8K, MATH-500, SVAMP, AQuA, AIME, CommonSenseQA, BIG-Bench-Hard) with open-source licenses. Code is released, enabling reproduction of raw experimental data."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 5.1 describes all datasets and their reasoning categories. Section 5.4 describes how 100 training questions are sampled per dataset with fixed seeds, and how 5 CoT samples are generated per model per question. Appendix F documents the cost collection procedure."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "No human participants are involved. All data comes from standard public benchmarks."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section 4 documents the full pipeline: data split into DSS and DCal, grid search over thresholds on DSS, quantile check on DCal, and evaluation on held-out test set. Section 5.4 specifies that seeds are fixed so all methods receive identical data."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "The Acknowledgments section discloses funding from NSERC (reference number 260250), NSERC PGS-D program, Stavros S. Niarchos Foundation Fellowship, Vadasz Doctoral Fellowship, and Fonds de recherche du Québec (project 324302)."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Author affiliations are clearly listed: McGill University, Mila - Quebec AI Institute, Int. Lab. Learning Systems, and Huawei. The Huawei affiliation is worth noting because Huawei has a commercial interest in efficient LLM inference, although none of the evaluated models are Huawei products."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Primary funding is from NSERC (a Canadian government research council) and academic fellowships, which have no financial stake in the outcomes. The Huawei affiliation could present a minor conflict but the paper evaluates multiple model families (LLAMA, QWEN, GPT) without favoring any particular one."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No competing interests or financial disclosure statement is present in the paper. Two authors are affiliated with Huawei, which has a commercial interest in efficient LLM inference, but no explicit conflicts of interest statement is provided."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The paper does not state the training data cutoff dates for any of the models used (LLAMA, QWEN, GPT). This is relevant because some benchmarks like GSM8K (2021) and MATH (2021) could be in the training data of newer models."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No analysis of potential train/test overlap is provided. Given that benchmarks like GSM8K and MATH-500 have been publicly available since 2021 and the models were trained after that, contamination is a real concern that is not addressed."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "Several benchmarks (GSM8K, MATH-500, CommonSenseQA, BIG-Bench-Hard) were published well before the training cutoffs of the models used (Llama 3.x, Qwen 2.5, GPT-4o-mini, o3-mini). The paper does not discuss contamination risk despite this being a known issue for these benchmarks."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants are involved in this study."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved in this study."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved in this study."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved in this study."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved in this study."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved in this study."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants are involved in this study."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": true,
    280         "justification": "Inference cost is a central metric of the paper. USD cost per question is reported extensively across all experiments (Figures 2, 7, 9, 11, boxplots). Per-token costs are documented in Appendix F Tables 2-4."
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": true,
    285         "justification": "The paper reports that grid search for optimal thresholds takes approximately 0.01 seconds on an M3 CPU for 4 LLMs with 10 grid points and 50 questions (Section 4.2). API inference costs are documented with per-token pricing. Experiments were conducted during April-May 2025 (Appendix F)."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "C3PO achieves state-of-the-art performance across 16 diverse reasoning benchmarks, outperforming strong LLM cascading baselines in both accuracy and cost-efficiency.",
    292       "evidence": "Figures 2, 7, 9, 11 show C3PO's accuracy-vs-cost Pareto curves dominating baselines (FrugalGPT, MoT, TREACLE, ModelSwitch) across LLAMA, QWEN, and GPT cascades on all 16 datasets.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "C3PO requires less than 20% of the MPM cost for an accuracy gap of at most 2% using a LLAMA cascade.",
    297       "evidence": "Figure 1 (right) and Figure 8 show boxplots across 16 datasets demonstrating this claim. The whiskers extend to 90% coverage.",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "C3PO's conformal cost guarantee holds empirically: only 1 violation in 300 runs across 15 datasets, 2 cascades, 5 budget levels, and 2 alpha values.",
    302       "evidence": "Section 5.4 reports this result directly: 'We observe only a single violation in 300 runs of C3PO.'",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "C3PO learns cascade decision rules using fewer than 1% of the examples used by TREACLE.",
    307       "evidence": "Section 1 states this claim. Section 5.4 confirms C3PO uses 100 training examples. However, the exact number used by TREACLE is not explicitly stated in the paper for comparison.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "C3PO is robust to cross-family model cascades (using models from different LLM families).",
    312       "evidence": "Section 5.4 and Figure 4 (right) show a mixed cascade (LLaMA 1B, Qwen 32B, GPT-4o-mini) performing comparably to single-family cascades.",
    313       "supported": "moderate"
    314     },
    315     {
    316       "claim": "C3PO demonstrates excellent distribution shift robustness compared to baselines.",
    317       "evidence": "Section 5.4 and Figures 5, 13 show training on GSM8K/SVAMP and testing on MATH-500, where C3PO outperforms FrugalGPT and TREACLE.",
    318       "supported": "moderate"
    319     }
    320   ],
    321   "methodology_tags": [
    322     "benchmark-eval",
    323     "theoretical"
    324   ],
    325   "key_findings": "C3PO is a self-supervised framework for optimizing LLM cascades under probabilistic cost constraints, using conformal prediction to bound inference cost and PAC-Bayesian analysis for generalization guarantees. Across 16 reasoning benchmarks with three model families (LLAMA, QWEN, GPT), C3PO achieves near-MPM accuracy at a fraction of the cost (less than 20% for LLAMA cascades), outperforming supervised (FrugalGPT, TREACLE) and unsupervised (MoT, ModelSwitch) baselines while requiring no labeled data. The conformal cost guarantee is empirically validated with only 1 violation in 300 experimental configurations.",
    326   "red_flags": [
    327     {
    328       "flag": "Benchmark contamination not addressed",
    329       "detail": "Several benchmarks used (GSM8K, MATH-500, CommonSenseQA, BIG-Bench-Hard) were published years before the training cutoffs of the evaluated models. The paper does not discuss whether model performance is inflated due to benchmark contamination. While C3PO's relative advantage over baselines may be less affected (all methods use the same models), absolute accuracy numbers could be misleading."
    330     },
    331     {
    332       "flag": "No statistical significance tests for comparative claims",
    333       "detail": "The paper claims C3PO 'outperforms' baselines but relies entirely on visual comparison of accuracy-cost curves with confidence intervals. No formal hypothesis tests are conducted to determine whether performance differences are statistically significant."
    334     },
    335     {
    336       "flag": "Industry affiliation without a competing-interests statement",
    337       "detail": "Two authors are affiliated with Huawei. The QWEN models used in experiments are developed by Alibaba (not Huawei), so there is no direct product evaluation conflict. However, Huawei has commercial interest in efficient LLM inference, and no competing interests statement is provided."
    338     }
    339   ],
    340   "cited_papers": [
    341     {
    342       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    343       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    344       "year": 2024,
    345       "relevance": "Foundational work on LLM cascading with supervised meta-models for cost reduction, directly compared as a baseline."
    346     },
    347     {
    348       "title": "Large Language Model Cascades with Mixture of Thoughts Representations for Cost-Efficient Reasoning",
    349       "authors": ["Murong Yue", "Jie Zhao", "Min Zhang", "Liang Du", "Ziyu Yao"],
    350       "year": 2024,
    351       "relevance": "Proposes unsupervised LLM cascading using self-consistency voting, a key baseline in this work."
    352     },
    353     {
    354       "title": "Efficient Contextual LLM Cascades through Budget-Constrained Policy Learning",
    355       "authors": ["Xuechen Zhang", "Zijian Huang", "Ege Onur Taga", "Carlee Joe-Wong", "Samet Oymak", "Jiasi Chen"],
    356       "year": 2024,
    357       "relevance": "TREACLE: RL-based cascade learning with budget constraints, a primary supervised baseline."
    358     },
    359     {
    360       "title": "Do We Truly Need So Many Samples? Multi-LLM Repeated Sampling Efficiently Scale Test-Time Compute",
    361       "authors": ["Jianhao Chen", "Zishuo Xun", "Bocheng Zhou"],
    362       "year": 2025,
    363       "relevance": "ModelSwitch: unsupervised early exit using self-consistency, directly compared as a baseline."
    364     },
    365     {
    366       "title": "RouteLLM: Learning to Route LLMs from Preference Data",
    367       "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu"],
    368       "year": 2025,
    369       "relevance": "Prominent LLM routing approach that makes pre-inference model selection decisions, contrasted with cascade methods."
    370     },
    371     {
    372       "title": "AutoMix: Automatically Mixing Language Models",
    373       "authors": ["Pranjal Aggarwal", "Aman Madaan", "Ankit Anand"],
    374       "year": 2024,
    375       "relevance": "LLM cascade method using self-consistency sampling with a verifier model, positioned as an ensemble/cascade hybrid."
    376     },
    377     {
    378       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    379       "authors": ["DeepSeek-AI"],
    380       "year": 2025,
    381       "relevance": "Demonstrates RL-based CoT pruning in a single model; C3PO notes such models can participate in its cascade framework."
    382     },
    383     {
    384       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    385       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    386       "year": 2022,
    387       "relevance": "Foundational work on chain-of-thought prompting that underpins the reasoning methodology evaluated in this paper."
    388     },
    389     {
    390       "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
    391       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"],
    392       "year": 2023,
    393       "relevance": "Introduces self-consistency sampling for CoT reasoning, the core confidence signal exploited by cascade methods including C3PO."
    394     },
    395     {
    396       "title": "Language Model Cascades: Token-Level Uncertainty and Beyond",
    397       "authors": ["Neha Gupta", "Harikrishna Narasimhan", "Wittawat Jitkrittum"],
    398       "year": 2024,
    399       "relevance": "Uses quantile features from token-level confidences for cascade decisions; requires model internals unlike C3PO."
    400     }
    401   ]
    402 }

Impressum · Datenschutz