scan.json (18945B)
1 { 2 "paper": { 3 "title": "Conformal Constrained Policy Optimization for Cost-Effective LLM Agents", 4 "authors": ["Wenwen Si", "Sooyong Jang", "Insup Lee", "Osbert Bastani"], 5 "year": 2025, 6 "venue": "arXiv preprint (AAAI 2026 submission)", 7 "arxiv_id": "2511.11828" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "No repository URL, GitHub link, or code archive is mentioned in the paper." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses publicly available benchmarks: HotpotQA (Yang et al. 2018) and MMLU (Hendrycks et al. 2021)." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions LLaMA-2-7B (8-bit) and GPT-4o but not software dependencies or library versions." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions, README, or scripts are provided." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": true, 36 "justification": "Box plots over 100 random evaluation splits are provided in Figures 2-5, showing distribution of coverage and cost." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims CCPO outperforms baselines but provides no statistical significance tests (no p-values, t-tests, etc.)." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports percentage cost reductions with context, e.g., 'CCPO reduces cost by 12% to 27% compared to this baseline' with absolute cost figures in tables." 
47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "Training set sizes (1000 for HotpotQA, 560 for MMLU) and test set size (200) are stated but not justified. No power analysis or discussion of whether these sizes are adequate." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Box plots over 100 random evaluation splits (Figures 2-5) show variance in coverage and cost across splits." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Multiple baselines are included: Random policy, LLM policies (GPT-4o, LLaMA variants), LLM-EXIT, UALA, CPO, CPO batch, CPO online (Table 1)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include recent methods: UALA (2024), LLM-EXIT (Lu et al. 2025), and CPO variants. These are contemporary. SWEET-RL (Zhou et al. 2025) is cited as recent related work but is not among the Table 1 baselines." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "CPO vs CPO batch vs CPO online vs CCPO effectively ablates the contribution of conformal prediction and online threshold calibration. CCPO is tested with different lambda values (0, 2e-4, 1e-4)." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Four metrics are reported: Cost, Coverage, Avg. Len., and Set Size (Section 5.1 Metrics)." 79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "The paper evaluates automated QA performance on benchmarks with ground-truth answers. Human evaluation is not relevant to these claims." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "Section 5.1: 'we use 200 examples for batch calibration and 200 for testing.' Explicit train/calibration/test splits."
89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down by dataset (HotpotQA, MMLU), by base model (LLaMA-2-7B, LLaMA-3.2-3B), and by alpha level (0.1, 0.2, 0.05) across Tables 2-9." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 5.2 discusses CPO failures: 'CPO often fails to find an optimal policy, especially at higher coverage levels' and sensitivity to noisy/limited data causing 'severe divergence during training.'" 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that smaller LLMs as policies 'exhibit limited ability to assess the correctness of responses... often performing comparably to random.' CPO failures at higher coverage are also reported." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims 'up to 30% cost reduction compared to other cost-aware baselines.' Tables show CCPO achieving 12-27% cost reduction vs CPO batch/online baselines, and larger savings vs LLM methods. The 30% claim is supported by the data." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "Causal claims are mainly ablation-style: comparing CCPO to CPO variants isolates the contribution of conformal prediction integration. The controlled single-variable manipulation (adding/removing conformal components) is adequate." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title and abstract claim 'Cost-Effective LLM Agents' and 'a principled and practical framework for deploying LLM agents' but results are only on two QA benchmarks with specific model pairs. The generalization to other agent tasks is not bounded." 
121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No discussion of alternative explanations for the results. Section 6 discusses technical design choices (clipping, stochastic policy) but not confounds or alternative explanations for observed improvements." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper says 'GPT-4o' without a snapshot date or API version. LLaMA-2-7B and LLaMA-3.2-3B are specified by size but GPT-4o lacks version specificity." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Full prompt templates are provided in Appendix 8: guide model template (8.1), LLM-EXIT template (8.2), and LLM-as-a-Policy template (8.3) with actual text." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Section 5.1 Training: learning rate 10^-3, batch size 10, KL constraint delta=0.01, epsilon=0.01, conformal decay xi=0.1, temperature 0.0/1.0 for MMLU, 3-layer NN with 64 hidden units, 1500-2000 training steps." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The orchestration framework is described in detail in Section 3 (LLM Agent Orchestration): base agent generates reasoning trace, guide agent evaluates, policy selects among three actions. The POMDP formulation describes the full workflow." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": false, 152 "justification": "No description of how HotpotQA and MMLU examples were selected or preprocessed. The paper states training/calibration/test split sizes but not selection criteria or any filtering applied."
153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "No dedicated limitations or threats-to-validity section. The conclusion is brief with no discussion of limitations." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No specific threats to validity are discussed anywhere in the paper." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper does not explicitly state what the results do NOT show or what settings are excluded. The broad claims about 'LLM agents' are not bounded to the tested QA setting." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "No raw experimental data (per-example results, training logs) is made available for independent verification." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "The data sources are described: HotpotQA and MMLU are standard benchmarks. Training/calibration/test split sizes are specified (Section 5.1)." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants; uses standard benchmarks." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": false, 191 "justification": "The pipeline from raw benchmark examples to final results is not documented. How the 1000/560 training examples were selected from the full datasets is not described." 
192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "Acknowledgements section: 'This work was supported in part by NIH R01EY037101 and NSF Award CCF-2338777.'" 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "All authors are from University of Pennsylvania, Department of Computer and Information Science. No product being evaluated is affiliated with their institution." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": true, 208 "justification": "Funding is from NIH and NSF, which are independent government agencies with no stake in the outcome of LLM cost optimization research." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "GPT-4o is used as the guide model and its training data cutoff is not stated. HotpotQA and MMLU are well-known benchmarks that could be in GPT-4o's training data." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of whether GPT-4o may have seen HotpotQA or MMLU examples during training." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "HotpotQA (2018) and MMLU (2021) were both published well before GPT-4o's training cutoff. No contamination discussion is provided." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 
243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "Cost is a primary metric. Tables 2-9 report cumulative API cost in cents for all methods across all experimental configurations." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No mention of total GPU hours, training time, or total API spend for the experiments. Training details (1500-2000 steps) are given but not the computational cost." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "CCPO achieves up to 30% cost reduction compared to cost-aware baselines while maintaining target coverage.", 286 "evidence": "Tables 2-5 show CCPO reducing cost by 12-27% vs CPO batch/online across HotpotQA and MMLU with different base models, while meeting coverage targets.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "CCPO provides coverage guarantees via conformal prediction that pointwise methods cannot match.", 291 "evidence": "Tables 2-5 show CPO (pointwise) consistently fails to meet coverage targets (e.g., 0.832 vs 0.9 target in Table 2), while CCPO achieves >=0.9 coverage. 
Theoretical guarantees in Section 4.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "LLM-based policies using small models perform comparably to random policies.", 296 "evidence": "Tables 2-5: LLaMA-2-7B as policy achieves 0.675 coverage vs Random's 0.578 (Table 2), and LLaMA-3.2-3B achieves 0.615 vs Random's 0.61 (Table 3). Section 5.2 discusses this.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "CCPO delivers more stable training compared to CPO under data challenges.", 301 "evidence": "Section 5.2 states this with reference to Appendix 10 training curves, but the training curves are not shown in the main text.", 302 "supported": "moderate" 303 } 304 ], 305 "methodology_tags": ["benchmark-eval", "theoretical"], 306 "key_findings": "CCPO combines constrained policy optimization with online conformal prediction to orchestrate cheap and expensive LLM agents for cost-effective question answering. On HotpotQA and MMLU, CCPO reduces API costs by 12-27% compared to CPO-based baselines while maintaining user-specified coverage guarantees. The method formalizes the orchestration problem as a POMDP and provides theoretical coverage guarantees via conformal prediction. Small LLMs used as policy controllers perform poorly, suggesting learned RL policies are more effective than LLM-based decision-making for this orchestration task.", 307 "red_flags": [ 308 { 309 "flag": "No limitations section", 310 "detail": "The paper has no discussion of limitations, threats to validity, or scope boundaries despite making broad claims about 'cost-effective LLM agents' based on two QA benchmarks." 311 }, 312 { 313 "flag": "Benchmark contamination risk unaddressed", 314 "detail": "GPT-4o is used as the guide model on HotpotQA (2018) and MMLU (2021), both likely in its training data. The empirical coverage and cost results may be inflated if GPT-4o already knows the answers."
315 }, 316 { 317 "flag": "No significance tests", 318 "detail": "Claims of superiority over baselines rely on point comparisons in tables without statistical tests, despite having 100-split box plot data that could support such tests." 319 } 320 ], 321 "cited_papers": [ 322 { 323 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 324 "authors": ["L. Chen", "M. Zaharia", "J. Zou"], 325 "year": 2024, 326 "relevance": "Directly addresses LLM cost reduction through cascading and caching, a key baseline for cost-effective LLM deployment." 327 }, 328 { 329 "title": "Towards Uncertainty-Aware Language Agent", 330 "authors": ["J. Han", "W. Buntine", "E. Shareghi"], 331 "year": 2024, 332 "relevance": "Proposes uncertainty-aware LLM agents (UALA), a key baseline in this paper for uncertainty-guided model selection." 333 }, 334 { 335 "title": "SWEET-RL: Training Multi-Turn LLM Agents on Collaborative Reasoning Tasks", 336 "authors": ["Y. Zhou", "S. Jiang", "Y. Tian", "J. Weston", "S. Levine", "S. Sukhbaatar", "X. Li"], 337 "year": 2025, 338 "arxiv_id": "2503.15478", 339 "relevance": "RL-based multi-turn LLM agent training, related approach to training agentic orchestration policies." 340 }, 341 { 342 "title": "Reflect, Retry, Reward: Self-Improving LLMs via Reinforcement Learning", 343 "authors": ["S. Bensal", "U. Jamil", "C. Bryant", "M. Russak", "K. Kamble", "D. Mozolevskyi", "M. Ali", "W. AlShikh"], 344 "year": 2025, 345 "arxiv_id": "2505.24726", 346 "relevance": "Self-improvement via RL for LLM agents, related work on RL-based agent training." 347 }, 348 { 349 "title": "Runaway is Ashamed, But Helpful: On the Early-Exit Behavior of Large Language Model-based Agents in Embodied Environments", 350 "authors": ["Q. Lu", "L. Ding", "S. Cao", "X. Liu", "K. Zhang", "J. Zhang", "D. Tao"], 351 "year": 2025, 352 "relevance": "Early-exit strategies for LLM agents to reduce cost, directly relevant to cost-effective agent deployment." 
353 }, 354 { 355 "title": "Chain-of-thought prompting elicits reasoning in large language models", 356 "authors": ["J. Wei", "X. Wang", "D. Schuurmans", "M. Bosma", "F. Xia", "E. Chi", "Q. V. Le", "D. Zhou"], 357 "year": 2022, 358 "relevance": "Foundational prompting technique used as part of the base agent reasoning strategy in this work." 359 }, 360 { 361 "title": "ReAct: Synergizing reasoning and acting in language models", 362 "authors": ["S. Yao", "J. Zhao", "D. Yu", "N. Du", "I. Shafran", "K. Narasimhan", "Y. Cao"], 363 "year": 2023, 364 "relevance": "Key agentic reasoning framework combining reasoning with tool use, foundational to LLM agent design." 365 }, 366 { 367 "title": "Self-Regulation and Requesting Interventions", 368 "authors": ["S. Y. Min", "Y. Wu", "J. Sun", "M. Kaufmann", "F. Tajwar", "Y. Bisk", "R. Salakhutdinov"], 369 "year": 2025, 370 "arxiv_id": "2502.04576", 371 "relevance": "Process reward models for training LLM agent help-seeking policies, related approach to learned orchestration." 372 } 373 ] 374 }