scan.json (23424B)
1 { 2 "paper": { 3 "title": "ACAR: Adaptive Complexity Routing for Multi-Model Ensembles with Auditable Decision Traces", 4 "authors": [ 5 "Ramchand Kumaresan" 6 ], 7 "year": 2026, 8 "arxiv_id": "2602.21231" 9 }, 10 "scan_version": 3, 11 "active_modules": [ 12 "experimental_rigor", 13 "data_leakage" 14 ], 15 "methodology_tags": [ 16 "benchmark-eval" 17 ], 18 "key_findings": "ACAR uses self-consistency variance from 3 probe samples to route tasks across single-model, two-model, and three-model execution modes, achieving 55.6% accuracy on 1,510 tasks (exceeding the two-model baseline of 54.4%) while avoiding full ensembling on 54.2% of tasks. Three significant negative results are documented: retrieval augmentation decreased accuracy by 3.4pp due to low semantic similarity (median 0.167), agreement-but-wrong failures create an 8pp ceiling below full ensembling, and attribution proxies showed weak correlation with ground-truth leave-one-out values.", 19 "claims": [ 20 { 21 "claim": "σ-based routing achieves 55.6% accuracy, exceeding the two-model baseline (54.4%) while avoiding full ensembling on 54.2% of tasks", 22 "evidence": "Table 1 shows ACAR-U at 55.6% (839/1510) vs Arena-2 at 54.4% (822/1510). Figure 6 shows 54.2% of tasks avoid full-arena mode.", 23 "supported": "moderate" 24 }, 25 { 26 "claim": "Retrieval augmentation decreased accuracy by 3.4 percentage points across all benchmarks", 27 "evidence": "Table 2 shows per-benchmark decreases: MathArena -5.0pp, Reasoning Gym -2.0pp, LiveCodeBench -4.0pp, SuperGPQA -3.2pp, overall 55.6% → 52.4%.", 28 "supported": "strong" 29 }, 30 { 31 "claim": "Agreement-but-wrong failure mode bounds achievable accuracy at 8pp below full ensembling", 32 "evidence": "Section 6.2 discusses this: when σ=0 and the answer is wrong, ACAR routes to single-agent and cannot recover. The 8pp gap (55.6% vs 63.6%) is shown in Table 1.", 33 "supported": "moderate" 34 }, 35 { 36 "claim": "Attribution proxies (response similarity, entropy) showed weak correlation with ground-truth leave-one-out values", 37 "evidence": "Section 6.3 states this but provides no quantitative correlation values or figures showing the weak correlation.", 38 "supported": "weak" 39 } 40 ], 41 "red_flags": [ 42 { 43 "flag": "No statistical tests for main comparison", 44 "detail": "The 1.2pp improvement of ACAR-U over Arena-2 (55.6% vs 54.4%) is presented without any significance test. On 1,510 tasks this difference may not be statistically significant." 45 }, 46 { 47 "flag": "Attribution claim underspecified", 48 "detail": "Section 6.3 claims attribution proxies don't work but provides no quantitative data (correlation coefficients, scatter plots). This is the weakest-evidenced claim in the paper." 49 }, 50 { 51 "flag": "Single-run evaluation", 52 "detail": "Despite using temperature 0 for determinism, no seed sensitivity analysis is reported. Different probe samples could yield different routing decisions." 53 }, 54 { 55 "flag": "Benchmark composition bias", 56 "detail": "SuperGPQA comprises 66% of tasks (1,000/1,510). The overall accuracy is dominated by this single benchmark, acknowledged in limitations but not corrected for in headline numbers." 57 } 58 ], 59 "checklist": { 60 "artifacts": { 61 "code_released": { 62 "applies": true, 63 "answer": true, 64 "justification": "GitHub repository URL provided: https://github.com/mechramc/ACAR-TeamLLM. Paper states 'Code and artifacts are publicly available' (Section 1.2 footnote)." 65 }, 66 "data_released": { 67 "applies": true, 68 "answer": true, 69 "justification": "Appendix B lists complete artifact manifest including runs.jsonl for all configurations (ACAR-U, ACAR-UJ, Arena-3, Arena-2, single-model). The benchmarks used are public (MathArena, Reasoning Gym, LiveCodeBench, SuperGPQA)." 70 }, 71 "environment_specified": { 72 "applies": true, 73 "answer": false, 74 "justification": "No requirements.txt, Dockerfile, conda environment, or library versions mentioned. Appendix A mentions 'environment fingerprint' in logs but does not describe the actual environment." 75 }, 76 "reproduction_instructions": { 77 "applies": true, 78 "answer": true, 79 "justification": "Appendix A states 'All figures regenerable from released artifacts' with figure regeneration scripts. Appendix B provides the artifact directory structure. 208 unit tests for infrastructure validation are mentioned." 80 } 81 }, 82 "statistical_methodology": { 83 "confidence_intervals_or_error_bars": { 84 "applies": true, 85 "answer": false, 86 "justification": "All results are point estimates. Table 1 shows raw accuracy percentages (e.g., 55.6%) with no confidence intervals or error bars." 87 }, 88 "significance_tests": { 89 "applies": true, 90 "answer": false, 91 "justification": "The paper claims ACAR-U 'exceeds' Arena-2 (55.6% vs 54.4%) without any significance test. No p-values, bootstrap tests, or other statistical tests are reported." 92 }, 93 "effect_sizes_reported": { 94 "applies": true, 95 "answer": true, 96 "justification": "Effect sizes are reported with baseline context throughout: '1.2 percentage points' improvement over Arena-2, '-3.4pp' for retrieval, '8pp gap' to Arena-3. Table 2 provides per-benchmark deltas." 97 }, 98 "sample_size_justified": { 99 "applies": true, 100 "answer": false, 101 "justification": "No justification for the 1,510 task count or the per-benchmark counts (60, 250, 200, 1000). No power analysis or discussion of whether 60 MathArena tasks is sufficient." 102 }, 103 "variance_reported": { 104 "applies": true, 105 "answer": false, 106 "justification": "Results are from single experimental runs. No variance, standard deviation, or spread measures across runs are reported." 107 } 108 }, 109 "evaluation_design": { 110 "baselines_included": { 111 "applies": true, 112 "answer": true, 113 "justification": "Three baselines: Single-Model (best single model), Arena-2 (two-model ensemble), Arena-3 (three-model ensemble). Table 1 compares all configurations." 114 }, 115 "baselines_contemporary": { 116 "applies": true, 117 "answer": false, 118 "justification": "No comparison against learned routing systems (RouterBench, FrugalGPT, RouteLLM) discussed in related work. The baselines are naive fixed-ensemble strategies, not state-of-the-art routing methods." 119 }, 120 "ablation_study": { 121 "applies": true, 122 "answer": true, 123 "justification": "ACAR-U vs ACAR-UJ is an ablation of the retrieval component. Per-benchmark breakdowns and escalation analysis further isolate component contributions." 124 }, 125 "multiple_metrics": { 126 "applies": true, 127 "answer": true, 128 "justification": "Reports accuracy, cost (USD), escalation rate, and latency (ms) across configurations." 129 }, 130 "human_evaluation": { 131 "applies": false, 132 "answer": false, 133 "justification": "The paper evaluates automated benchmark performance with execution-verified answers. Human evaluation is not relevant to the claims about routing accuracy and cost." 134 }, 135 "held_out_test_set": { 136 "applies": true, 137 "answer": true, 138 "justification": "The benchmarks are external test sets not used for any tuning. ACAR uses no learned parameters that could overfit." 139 }, 140 "per_category_breakdown": { 141 "applies": true, 142 "answer": true, 143 "justification": "Figure 3 and Table 2 provide per-benchmark breakdowns. Figure 5 shows escalation distribution by benchmark." 144 }, 145 "failure_cases_discussed": { 146 "applies": true, 147 "answer": true, 148 "justification": "Section 6 is entirely dedicated to failure modes: retrieval hurting performance (6.1), agreement-but-wrong (6.2), and attribution proxy failure (6.3)." 149 }, 150 "negative_results_reported": { 151 "applies": true, 152 "answer": true, 153 "justification": "Three significant negative results are prominently reported in Section 6. This is a notable strength of the paper." 154 } 155 }, 156 "claims_and_evidence": { 157 "abstract_claims_supported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Abstract claims (55.6% accuracy, -3.4pp retrieval, 8pp gap, weak attribution correlation) are supported by results. The abstract explicitly states what does not hold." 161 }, 162 "causal_claims_justified": { 163 "applies": true, 164 "answer": true, 165 "justification": "Causal claims like 'retrieval augmentation decreased accuracy' are supported by controlled comparison (ACAR-U vs ACAR-UJ, same setup minus retrieval). The ablation design is adequate." 166 }, 167 "generalization_bounded": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 8 explicitly bounds scope: 'Three models from major providers; may not generalize to open-source models.' SuperGPQA dominance acknowledged. Paper frames itself as a measurement framework." 171 }, 172 "alternative_explanations_discussed": { 173 "applies": true, 174 "answer": true, 175 "justification": "Section 6 discusses why retrieval fails (low similarity, not inherent limitation), why accuracy ceiling exists (intrinsic to self-consistency), and Section 8 notes code equivalence inflating LiveCodeBench escalation." 176 }, 177 "proxy_outcome_distinction": { 178 "applies": true, 179 "answer": true, 180 "justification": "The paper measures accuracy on four benchmarks and frames results precisely at that granularity: '55.6% accuracy' on specific tasks. It explicitly discusses what accuracy does NOT capture (Section 8 limitations). The paper frames itself as 'a measurement framework' rather than claiming broader routing effectiveness." 181 } 182 }, 183 "setup_transparency": { 184 "model_versions_specified": { 185 "applies": true, 186 "answer": false, 187 "justification": "Models listed as 'Claude Sonnet 4', 'GPT-4o', 'Gemini 2.0 Flash' without specific API versions or snapshot dates." 188 }, 189 "prompts_provided": { 190 "applies": true, 191 "answer": false, 192 "justification": "No prompt text is provided in the paper or appendix. Algorithm 1 describes the routing procedure but not the actual prompts sent to models." 193 }, 194 "hyperparameters_reported": { 195 "applies": true, 196 "answer": true, 197 "justification": "Temperature 0 stated for deterministic evaluation (Section 4.2). N=3 probe samples specified. Retrieval similarity threshold of 0.0 stated for ACAR-UJ." 198 }, 199 "scaffolding_described": { 200 "applies": true, 201 "answer": true, 202 "justification": "Algorithm 1 provides the complete routing procedure. Section 3.1 describes TEAMLLM substrate with deterministic execution, immutable artifacts, and state machine. Section 3.2 details σ-based routing." 203 }, 204 "data_preprocessing_documented": { 205 "applies": true, 206 "answer": false, 207 "justification": "No description of how 1,510 tasks were selected from the four benchmarks. No sampling methodology for the 1,000 SuperGPQA tasks or 60 MathArena tasks." 208 } 209 }, 210 "limitations_and_scope": { 211 "limitations_section_present": { 212 "applies": true, 213 "answer": true, 214 "justification": "Section 8 'Limitations' lists four specific limitations: model set, benchmark bias, no learned routing, and code equivalence." 215 }, 216 "threats_to_validity_specific": { 217 "applies": true, 218 "answer": true, 219 "justification": "Limitations are specific: 'SuperGPQA dominates (66% of tasks)', 'LiveCodeBench escalation is inflated by syntactically different but semantically equivalent outputs', three-model limitation." 220 }, 221 "scope_boundaries_stated": { 222 "applies": true, 223 "answer": true, 224 "justification": "Section 8 states what results do not show: does not generalize to open-source models, learned routers may outperform on specific distributions." 225 } 226 }, 227 "data_integrity": { 228 "raw_data_available": { 229 "applies": true, 230 "answer": true, 231 "justification": "Appendix B lists complete artifact manifest with runs.jsonl containing per-task decision traces for all configurations. 7,550+ auditable runs released." 232 }, 233 "data_collection_described": { 234 "applies": true, 235 "answer": true, 236 "justification": "Section 4.1 describes the four benchmarks, their task counts, and types. Section 3.1 describes TEAMLLM execution substrate." 237 }, 238 "recruitment_methods_described": { 239 "applies": false, 240 "answer": false, 241 "justification": "No human participants. Data comes from standard benchmarks." 242 }, 243 "data_pipeline_documented": { 244 "applies": true, 245 "answer": true, 246 "justification": "Section 3.1 describes the TEAMLLM pipeline: PENDING → EXECUTING → VERIFYING → COMPLETED. Algorithm 1 shows the complete procedure. Appendix A confirms zero parse errors." 247 } 248 }, 249 "conflicts_of_interest": { 250 "funding_disclosed": { 251 "applies": true, 252 "answer": false, 253 "justification": "No funding source or acknowledgments section mentioning grants or sponsors." 254 }, 255 "affiliations_disclosed": { 256 "applies": true, 257 "answer": false, 258 "justification": "Author listed as 'Ramchand Kumaresan' with no institutional affiliation provided." 259 }, 260 "funder_independent_of_outcome": { 261 "applies": true, 262 "answer": false, 263 "justification": "No funding information disclosed, so independence cannot be assessed." 264 }, 265 "financial_interests_declared": { 266 "applies": true, 267 "answer": false, 268 "justification": "No competing interests statement or financial disclosure present in the paper." 269 } 270 }, 271 "contamination": { 272 "training_cutoff_stated": { 273 "applies": true, 274 "answer": false, 275 "justification": "No training data cutoff dates stated for any of the three models used." 276 }, 277 "train_test_overlap_discussed": { 278 "applies": true, 279 "answer": false, 280 "justification": "No discussion of whether benchmark tasks may have appeared in model training data." 281 }, 282 "benchmark_contamination_addressed": { 283 "applies": true, 284 "answer": false, 285 "justification": "No contamination analysis for any of the four benchmarks. LiveCodeBench uses temporal splits which helps but this is not discussed in the context of contamination." 286 } 287 }, 288 "human_studies": { 289 "pre_registered": { 290 "applies": false, 291 "answer": false, 292 "justification": "No human participants in this study." 293 }, 294 "irb_or_ethics_approval": { 295 "applies": false, 296 "answer": false, 297 "justification": "No human participants in this study." 298 }, 299 "demographics_reported": { 300 "applies": false, 301 "answer": false, 302 "justification": "No human participants in this study." 303 }, 304 "inclusion_exclusion_criteria": { 305 "applies": false, 306 "answer": false, 307 "justification": "No human participants in this study." 308 }, 309 "randomization_described": { 310 "applies": false, 311 "answer": false, 312 "justification": "No human participants in this study." 313 }, 314 "blinding_described": { 315 "applies": false, 316 "answer": false, 317 "justification": "No human participants in this study." 318 }, 319 "attrition_reported": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants in this study." 323 } 324 }, 325 "cost_and_practicality": { 326 "inference_cost_reported": { 327 "applies": true, 328 "answer": true, 329 "justification": "Table 1 reports total cost in USD for each configuration: Single-Model $17.04, Arena-2 $20.64, ACAR-U $20.34, Arena-3 $20.64." 330 }, 331 "compute_budget_stated": { 332 "applies": true, 333 "answer": true, 334 "justification": "Total API costs stated per configuration. 7,550+ total runs documented. Latency reported in Figure 7." 335 } 336 }, 337 "experimental_rigor": { 338 "seed_sensitivity_reported": { 339 "applies": true, 340 "answer": false, 341 "justification": "No multi-seed analysis. Temperature 0 used for determinism but no seed sensitivity study." 342 }, 343 "number_of_runs_stated": { 344 "applies": true, 345 "answer": true, 346 "justification": "7,550+ total runs explicitly stated. 1,510 tasks per configuration. N=3 probe samples per task." 347 }, 348 "hyperparameter_search_budget": { 349 "applies": true, 350 "answer": false, 351 "justification": "No hyperparameter search reported. N=3, σ thresholds, and retrieval similarity threshold of 0.0 appear chosen without systematic search." 352 }, 353 "best_config_selection_justified": { 354 "applies": true, 355 "answer": true, 356 "justification": "All configurations reported and compared in Table 1. Both positive and negative results shown. No cherry-picking." 357 }, 358 "multiple_comparison_correction": { 359 "applies": false, 360 "answer": false, 361 "justification": "No statistical tests performed, so multiple comparison correction is not applicable." 362 }, 363 "self_comparison_bias_addressed": { 364 "applies": true, 365 "answer": false, 366 "justification": "Authors evaluate their own ACAR system against baselines they implemented. No acknowledgment of self-comparison bias." 367 }, 368 "compute_budget_vs_performance": { 369 "applies": true, 370 "answer": true, 371 "justification": "Figure 4 shows cost vs accuracy Pareto frontier. Table 1 reports both accuracy and cost for all configurations." 372 }, 373 "benchmark_construct_validity": { 374 "applies": true, 375 "answer": false, 376 "justification": "No discussion of whether the four benchmarks actually measure the capabilities ACAR claims to route for." 377 }, 378 "scaffold_confound_addressed": { 379 "applies": true, 380 "answer": true, 381 "justification": "The paper explicitly addresses scaffolding as a variable: ACAR-U vs ACAR-UJ ablates the retrieval scaffold, and the routing mechanism itself is the scaffold being studied. The paper does not compare models across different scaffolds and attribute differences to models — it evaluates the scaffold (routing) as the primary variable." 382 } 383 }, 384 "data_leakage": { 385 "temporal_leakage_addressed": { 386 "applies": true, 387 "answer": false, 388 "justification": "No discussion of whether benchmark problems existed before models' training cutoffs." 389 }, 390 "feature_leakage_addressed": { 391 "applies": true, 392 "answer": false, 393 "justification": "No discussion of whether the evaluation setup leaks information through routing or probe context." 394 }, 395 "non_independence_addressed": { 396 "applies": true, 397 "answer": false, 398 "justification": "No discussion of train/test independence across the four benchmarks." 399 }, 400 "leakage_detection_method": { 401 "applies": true, 402 "answer": false, 403 "justification": "No leakage detection or prevention method applied." 404 } 405 } 406 }, 407 "cited_papers": [ 408 { 409 "title": "RouterBench: A benchmark for multi-LLM routing system", 410 "authors": [ 411 "Qitian Jason Hu", 412 "Jacob Bieker", 413 "Xiuyu Li", 414 "Nan Jiang", 415 "Benjamin Keigwin", 416 "Gaurav Ranganath", 417 "Kurt Keutzer", 418 "Shriyash Kaustubh Upadhyay" 419 ], 420 "year": 2024, 421 "arxiv_id": "2403.12031", 422 "relevance": "Benchmark for evaluating LLM routing systems, directly comparable to ACAR's routing approach." 423 }, 424 { 425 "title": "FrugalGPT: How to use large language models while reducing cost and improving performance", 426 "authors": [ 427 "Lingjiao Chen", 428 "Matei Zaharia", 429 "James Zou" 430 ], 431 "year": 2023, 432 "arxiv_id": "2305.05176", 433 "relevance": "Cascading routing strategy for cost-efficient LLM deployment, a key baseline approach for multi-model orchestration." 434 }, 435 { 436 "title": "RouteLLM: Learning to route LLMs with preference data", 437 "authors": [ 438 "Isaac Ong", 439 "Amjad Almahairi", 440 "Vincent Wu", 441 "Wei-Lin Chiang", 442 "Tianhao Wu", 443 "Joseph E. Gonzalez", 444 "M Waleed Kadous", 445 "Ion Stoica" 446 ], 447 "year": 2025, 448 "relevance": "Learned router using preference data, representing state-of-the-art in LLM routing that ACAR deliberately avoids." 449 }, 450 { 451 "title": "ReAct: Synergizing reasoning and acting in language models", 452 "authors": [ 453 "Shunyu Yao", 454 "Jeffrey Zhao", 455 "Dian Yu", 456 "Nan Du", 457 "Izhak Shafran", 458 "Karthik Narasimhan", 459 "Yuan Cao" 460 ], 461 "year": 2023, 462 "relevance": "Foundational work on tool-augmented LLM agents, relevant to multi-model coordination approaches." 463 }, 464 { 465 "title": "A survey on mixture of experts in large language models", 466 "authors": [ 467 "Weilin Cai", 468 "Juyong Jiang", 469 "Fan Wang", 470 "Jing Tang", 471 "Sunghun Kim", 472 "Jiayi Huang" 473 ], 474 "year": 2024, 475 "arxiv_id": "2407.06204", 476 "relevance": "Surveys intra-model routing via MoE, contrasting with ACAR's inter-model routing approach." 477 }, 478 { 479 "title": "The Shapley value in machine learning", 480 "authors": [ 481 "Benedek Rozemberczki", 482 "Lauren Watson", 483 "Péter Bayer" 484 ], 485 "year": 2022, 486 "relevance": "Attribution methodology for ML model contributions, relevant to ACAR's failed attribution proxy experiments." 487 } 488 ], 489 "engagement_factors": { 490 "practical_relevance": { 491 "score": 1, 492 "justification": "The routing concept is potentially useful but the specific implementation is tightly coupled to a custom substrate and the accuracy gains are marginal (1.2pp)." 493 }, 494 "surprise_contrarian": { 495 "score": 2, 496 "justification": "The finding that retrieval augmentation consistently hurts performance (-3.4pp) is counterintuitive and challenges the 'more context is better' assumption prevalent in RAG discourse." 497 }, 498 "fear_safety": { 499 "score": 0, 500 "justification": "No safety, security, or risk angle is present in the paper." 501 }, 502 "drama_conflict": { 503 "score": 0, 504 "justification": "No controversy, company criticism, or replication failure — the paper is a straightforward technical evaluation." 505 }, 506 "demo_ability": { 507 "score": 1, 508 "justification": "Code and artifacts are released on GitHub but require significant setup with three paid API providers to reproduce." 509 }, 510 "brand_recognition": { 511 "score": 0, 512 "justification": "Single unknown author, no venue, no institutional affiliation listed." 513 } 514 } 515 }