scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23424B)
      1 {
      2   "paper": {
      3     "title": "ACAR: Adaptive Complexity Routing for Multi-Model Ensembles with Auditable Decision Traces",
      4     "authors": [
      5       "Ramchand Kumaresan"
      6     ],
      7     "year": 2026,
      8     "arxiv_id": "2602.21231"
      9   },
     10   "scan_version": 3,
     11   "active_modules": [
     12     "experimental_rigor",
     13     "data_leakage"
     14   ],
     15   "methodology_tags": [
     16     "benchmark-eval"
     17   ],
     18   "key_findings": "ACAR uses self-consistency variance from 3 probe samples to route tasks across single-model, two-model, and three-model execution modes, achieving 55.6% accuracy on 1,510 tasks (exceeding the two-model baseline of 54.4%) while avoiding full ensembling on 54.2% of tasks. Three significant negative results are documented: retrieval augmentation decreased accuracy by 3.4pp due to low semantic similarity (median 0.167), agreement-but-wrong failures create an 8pp ceiling below full ensembling, and attribution proxies showed weak correlation with ground-truth leave-one-out values.",
     19   "claims": [
     20     {
     21       "claim": "σ-based routing achieves 55.6% accuracy, exceeding the two-model baseline (54.4%) while avoiding full ensembling on 54.2% of tasks",
     22       "evidence": "Table 1 shows ACAR-U at 55.6% (839/1510) vs Arena-2 at 54.4% (822/1510). Figure 6 shows 54.2% of tasks avoid full-arena mode.",
     23       "supported": "moderate"
     24     },
     25     {
     26       "claim": "Retrieval augmentation decreased accuracy by 3.4 percentage points across all benchmarks",
     27       "evidence": "Table 2 shows per-benchmark decreases: MathArena -5.0pp, Reasoning Gym -2.0pp, LiveCodeBench -4.0pp, SuperGPQA -3.2pp, overall 55.6% → 52.4% (a 3.2pp drop as tabulated, slightly below the 3.4pp headline figure).",
     28       "supported": "strong"
     29     },
     30     {
     31       "claim": "Agreement-but-wrong failure mode bounds achievable accuracy at 8pp below full ensembling",
     32       "evidence": "Section 6.2 discusses this: when σ=0 and the answer is wrong, ACAR routes to single-agent and cannot recover. The 8pp gap (55.6% vs 63.6%) is shown in Table 1.",
     33       "supported": "moderate"
     34     },
     35     {
     36       "claim": "Attribution proxies (response similarity, entropy) showed weak correlation with ground-truth leave-one-out values",
     37       "evidence": "Section 6.3 states this but provides no quantitative correlation values or figures showing the weak correlation.",
     38       "supported": "weak"
     39     }
     40   ],
     41   "red_flags": [
     42     {
     43       "flag": "No statistical tests for main comparison",
     44       "detail": "The 1.2pp improvement of ACAR-U over Arena-2 (55.6% vs 54.4%, i.e. 839 vs 822 correct out of 1,510) is presented without any significance test. A 17-task difference at this sample size may not be statistically significant."
     45     },
     46     {
     47       "flag": "Attribution claim underspecified",
     48       "detail": "Section 6.3 claims attribution proxies don't work but provides no quantitative data (correlation coefficients, scatter plots). This is the weakest-evidenced claim in the paper."
     49     },
     50     {
     51       "flag": "Single-run evaluation",
     52       "detail": "Temperature 0 is used for determinism, but results come from a single run and no seed sensitivity analysis is reported. Different probe samples could yield different routing decisions."
     53     },
     54     {
     55       "flag": "Benchmark composition bias",
     56       "detail": "SuperGPQA comprises 66% of tasks (1,000/1,510). The overall accuracy is dominated by this single benchmark, acknowledged in limitations but not corrected for in headline numbers."
     57     }
     58   ],
     59   "checklist": {
     60     "artifacts": {
     61       "code_released": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "GitHub repository URL provided: https://github.com/mechramc/ACAR-TeamLLM. Paper states 'Code and artifacts are publicly available' (Section 1.2 footnote)."
     65       },
     66       "data_released": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Appendix B lists complete artifact manifest including runs.jsonl for all configurations (ACAR-U, ACAR-UJ, Arena-3, Arena-2, single-model). The benchmarks used are public (MathArena, Reasoning Gym, LiveCodeBench, SuperGPQA)."
     70       },
     71       "environment_specified": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No requirements.txt, Dockerfile, conda environment, or library versions mentioned. Appendix A mentions 'environment fingerprint' in logs but does not describe the actual environment."
     75       },
     76       "reproduction_instructions": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Appendix A states 'All figures regenerable from released artifacts' with figure regeneration scripts. Appendix B provides the artifact directory structure. 208 unit tests for infrastructure validation are mentioned."
     80       }
     81     },
     82     "statistical_methodology": {
     83       "confidence_intervals_or_error_bars": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "All results are point estimates. Table 1 shows raw accuracy percentages (e.g., 55.6%) with no confidence intervals or error bars."
     87       },
     88       "significance_tests": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "The paper claims ACAR-U 'exceeds' Arena-2 (55.6% vs 54.4%) without any significance test. No p-values, bootstrap tests, or other statistical tests are reported."
     92       },
     93       "effect_sizes_reported": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Effect sizes are reported with baseline context throughout: '1.2 percentage points' improvement over Arena-2, '-3.4pp' for retrieval, '8pp gap' to Arena-3. Table 2 provides per-benchmark deltas."
     97       },
     98       "sample_size_justified": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "No justification for the 1,510 task count or the per-benchmark counts (60, 250, 200, 1000). No power analysis or discussion of whether 60 MathArena tasks is sufficient."
    102       },
    103       "variance_reported": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "Results are from single experimental runs. No variance, standard deviation, or spread measures across runs are reported."
    107       }
    108     },
    109     "evaluation_design": {
    110       "baselines_included": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Three baselines: Single-Model (best single model), Arena-2 (two-model ensemble), Arena-3 (three-model ensemble). Table 1 compares all configurations."
    114       },
    115       "baselines_contemporary": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "No comparison against learned routing systems (RouterBench, FrugalGPT, RouteLLM) discussed in related work. The baselines are naive fixed-ensemble strategies, not state-of-the-art routing methods."
    119       },
    120       "ablation_study": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "ACAR-U vs ACAR-UJ is an ablation of the retrieval component. Per-benchmark breakdowns and escalation analysis further isolate component contributions."
    124       },
    125       "multiple_metrics": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Reports accuracy, cost (USD), escalation rate, and latency (ms) across configurations."
    129       },
    130       "human_evaluation": {
    131         "applies": false,
    132         "answer": false,
    133         "justification": "The paper evaluates automated benchmark performance with execution-verified answers. Human evaluation is not relevant to the claims about routing accuracy and cost."
    134       },
    135       "held_out_test_set": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The benchmarks are external test sets not used for any tuning. ACAR uses no learned parameters that could overfit."
    139       },
    140       "per_category_breakdown": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Figure 3 and Table 2 provide per-benchmark breakdowns. Figure 5 shows escalation distribution by benchmark."
    144       },
    145       "failure_cases_discussed": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Section 6 is entirely dedicated to failure modes: retrieval hurting performance (6.1), agreement-but-wrong (6.2), and attribution proxy failure (6.3)."
    149       },
    150       "negative_results_reported": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Three significant negative results are prominently reported in Section 6. This is a notable strength of the paper."
    154       }
    155     },
    156     "claims_and_evidence": {
    157       "abstract_claims_supported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Abstract claims (55.6% accuracy, -3.4pp retrieval, 8pp gap, weak attribution correlation) are supported by results. The abstract explicitly states what does not hold."
    161       },
    162       "causal_claims_justified": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Causal claims like 'retrieval augmentation decreased accuracy' are supported by controlled comparison (ACAR-U vs ACAR-UJ, same setup minus retrieval). The ablation design is adequate."
    166       },
    167       "generalization_bounded": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 8 explicitly bounds scope: 'Three models from major providers; may not generalize to open-source models.' SuperGPQA dominance acknowledged. Paper frames itself as a measurement framework."
    171       },
    172       "alternative_explanations_discussed": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 6 discusses why retrieval fails (low similarity rather than an inherent limitation) and why the accuracy ceiling exists (intrinsic to self-consistency); Section 8 notes that code equivalence inflates LiveCodeBench escalation."
    176       },
    177       "proxy_outcome_distinction": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "The paper measures accuracy on four benchmarks and frames results precisely at that granularity: '55.6% accuracy' on specific tasks. It explicitly discusses what accuracy does NOT capture (Section 8 limitations). The paper frames itself as 'a measurement framework' rather than claiming broader routing effectiveness."
    181       }
    182     },
    183     "setup_transparency": {
    184       "model_versions_specified": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "Models listed as 'Claude Sonnet 4', 'GPT-4o', 'Gemini 2.0 Flash' without specific API versions or snapshot dates."
    188       },
    189       "prompts_provided": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "No prompt text is provided in the paper or appendix. Algorithm 1 describes the routing procedure but not the actual prompts sent to models."
    193       },
    194       "hyperparameters_reported": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Temperature 0 stated for deterministic evaluation (Section 4.2). N=3 probe samples specified. Retrieval similarity threshold of 0.0 stated for ACAR-UJ."
    198       },
    199       "scaffolding_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Algorithm 1 provides the complete routing procedure. Section 3.1 describes TEAMLLM substrate with deterministic execution, immutable artifacts, and state machine. Section 3.2 details σ-based routing."
    203       },
    204       "data_preprocessing_documented": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No description of how 1,510 tasks were selected from the four benchmarks. No sampling methodology for the 1,000 SuperGPQA tasks or 60 MathArena tasks."
    208       }
    209     },
    210     "limitations_and_scope": {
    211       "limitations_section_present": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Section 8 'Limitations' lists four specific limitations: model set, benchmark bias, no learned routing, and code equivalence."
    215       },
    216       "threats_to_validity_specific": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Limitations are specific: 'SuperGPQA dominates (66% of tasks)', 'LiveCodeBench escalation is inflated by syntactically different but semantically equivalent outputs', three-model limitation."
    220       },
    221       "scope_boundaries_stated": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "Section 8 states what results do not show: does not generalize to open-source models, learned routers may outperform on specific distributions."
    225       }
    226     },
    227     "data_integrity": {
    228       "raw_data_available": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "Appendix B lists complete artifact manifest with runs.jsonl containing per-task decision traces for all configurations. 7,550+ auditable runs released."
    232       },
    233       "data_collection_described": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "Section 4.1 describes the four benchmarks, their task counts, and types. Section 3.1 describes TEAMLLM execution substrate."
    237       },
    238       "recruitment_methods_described": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants. Data comes from standard benchmarks."
    242       },
    243       "data_pipeline_documented": {
    244         "applies": true,
    245         "answer": true,
    246         "justification": "Section 3.1 describes the TEAMLLM pipeline: PENDING → EXECUTING → VERIFYING → COMPLETED. Algorithm 1 shows the complete procedure. Appendix A confirms zero parse errors."
    247       }
    248     },
    249     "conflicts_of_interest": {
    250       "funding_disclosed": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "No funding source or acknowledgments section mentioning grants or sponsors."
    254       },
    255       "affiliations_disclosed": {
    256         "applies": true,
    257         "answer": false,
    258         "justification": "Author listed as 'Ramchand Kumaresan' with no institutional affiliation provided."
    259       },
    260       "funder_independent_of_outcome": {
    261         "applies": true,
    262         "answer": false,
    263         "justification": "No funding information disclosed, so independence cannot be assessed."
    264       },
    265       "financial_interests_declared": {
    266         "applies": true,
    267         "answer": false,
    268         "justification": "No competing interests statement or financial disclosure present in the paper."
    269       }
    270     },
    271     "contamination": {
    272       "training_cutoff_stated": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "No training data cutoff dates stated for any of the three models used."
    276       },
    277       "train_test_overlap_discussed": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No discussion of whether benchmark tasks may have appeared in model training data."
    281       },
    282       "benchmark_contamination_addressed": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "No contamination analysis for any of the four benchmarks. LiveCodeBench uses temporal splits which helps but this is not discussed in the context of contamination."
    286       }
    287     },
    288     "human_studies": {
    289       "pre_registered": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "No human participants in this study."
    293       },
    294       "irb_or_ethics_approval": {
    295         "applies": false,
    296         "answer": false,
    297         "justification": "No human participants in this study."
    298       },
    299       "demographics_reported": {
    300         "applies": false,
    301         "answer": false,
    302         "justification": "No human participants in this study."
    303       },
    304       "inclusion_exclusion_criteria": {
    305         "applies": false,
    306         "answer": false,
    307         "justification": "No human participants in this study."
    308       },
    309       "randomization_described": {
    310         "applies": false,
    311         "answer": false,
    312         "justification": "No human participants in this study."
    313       },
    314       "blinding_described": {
    315         "applies": false,
    316         "answer": false,
    317         "justification": "No human participants in this study."
    318       },
    319       "attrition_reported": {
    320         "applies": false,
    321         "answer": false,
    322         "justification": "No human participants in this study."
    323       }
    324     },
    325     "cost_and_practicality": {
    326       "inference_cost_reported": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "Table 1 reports total cost in USD for each configuration: Single-Model $17.04, Arena-2 $20.64, ACAR-U $20.34, Arena-3 $20.64."
    330       },
    331       "compute_budget_stated": {
    332         "applies": true,
    333         "answer": true,
    334         "justification": "Total API costs stated per configuration. 7,550+ total runs documented. Latency reported in Figure 7."
    335       }
    336     },
    337     "experimental_rigor": {
    338       "seed_sensitivity_reported": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No multi-seed analysis. Temperature 0 used for determinism but no seed sensitivity study."
    342       },
    343       "number_of_runs_stated": {
    344         "applies": true,
    345         "answer": true,
    346         "justification": "7,550+ total runs explicitly stated. 1,510 tasks per configuration. N=3 probe samples per task."
    347       },
    348       "hyperparameter_search_budget": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No hyperparameter search reported. N=3, σ thresholds, and retrieval similarity threshold of 0.0 appear chosen without systematic search."
    352       },
    353       "best_config_selection_justified": {
    354         "applies": true,
    355         "answer": true,
    356         "justification": "All configurations reported and compared in Table 1. Both positive and negative results shown. No cherry-picking."
    357       },
    358       "multiple_comparison_correction": {
    359         "applies": false,
    360         "answer": false,
    361         "justification": "No statistical tests performed, so multiple comparison correction is not applicable."
    362       },
    363       "self_comparison_bias_addressed": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "The sole author evaluates their own ACAR system against self-implemented baselines. The paper does not acknowledge self-comparison bias."
    367       },
    368       "compute_budget_vs_performance": {
    369         "applies": true,
    370         "answer": true,
    371         "justification": "Figure 4 shows cost vs accuracy Pareto frontier. Table 1 reports both accuracy and cost for all configurations."
    372       },
    373       "benchmark_construct_validity": {
    374         "applies": true,
    375         "answer": false,
    376         "justification": "No discussion of whether the four benchmarks actually measure the capabilities ACAR claims to route for."
    377       },
    378       "scaffold_confound_addressed": {
    379         "applies": true,
    380         "answer": true,
    381         "justification": "The paper explicitly addresses scaffolding as a variable: ACAR-U vs ACAR-UJ ablates the retrieval scaffold, and the routing mechanism itself is the scaffold being studied. The paper does not compare models across different scaffolds and attribute differences to models — it evaluates the scaffold (routing) as the primary variable."
    382       }
    383     },
    384     "data_leakage": {
    385       "temporal_leakage_addressed": {
    386         "applies": true,
    387         "answer": false,
    388         "justification": "No discussion of whether benchmark problems existed before models' training cutoffs."
    389       },
    390       "feature_leakage_addressed": {
    391         "applies": true,
    392         "answer": false,
    393         "justification": "No discussion of whether the evaluation setup leaks information through routing or probe context."
    394       },
    395       "non_independence_addressed": {
    396         "applies": true,
    397         "answer": false,
    398         "justification": "No discussion of train/test independence across the four benchmarks."
    399       },
    400       "leakage_detection_method": {
    401         "applies": true,
    402         "answer": false,
    403         "justification": "No leakage detection or prevention method applied."
    404       }
    405     }
    406   },
    407   "cited_papers": [
    408     {
    409       "title": "RouterBench: A benchmark for multi-LLM routing system",
    410       "authors": [
    411         "Qitian Jason Hu",
    412         "Jacob Bieker",
    413         "Xiuyu Li",
    414         "Nan Jiang",
    415         "Benjamin Keigwin",
    416         "Gaurav Ranganath",
    417         "Kurt Keutzer",
    418         "Shriyash Kaustubh Upadhyay"
    419       ],
    420       "year": 2024,
    421       "arxiv_id": "2403.12031",
    422       "relevance": "Benchmark for evaluating LLM routing systems, directly comparable to ACAR's routing approach."
    423     },
    424     {
    425       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    426       "authors": [
    427         "Lingjiao Chen",
    428         "Matei Zaharia",
    429         "James Zou"
    430       ],
    431       "year": 2023,
    432       "arxiv_id": "2305.05176",
    433       "relevance": "Cascading routing strategy for cost-efficient LLM deployment, a key baseline approach for multi-model orchestration."
    434     },
    435     {
    436       "title": "RouteLLM: Learning to route LLMs with preference data",
    437       "authors": [
    438         "Isaac Ong",
    439         "Amjad Almahairi",
    440         "Vincent Wu",
    441         "Wei-Lin Chiang",
    442         "Tianhao Wu",
    443         "Joseph E. Gonzalez",
    444         "M Waleed Kadous",
    445         "Ion Stoica"
    446       ],
    447       "year": 2025,
    448       "relevance": "Learned router using preference data, representing state-of-the-art in LLM routing that ACAR deliberately avoids."
    449     },
    450     {
    451       "title": "ReAct: Synergizing reasoning and acting in language models",
    452       "authors": [
    453         "Shunyu Yao",
    454         "Jeffrey Zhao",
    455         "Dian Yu",
    456         "Nan Du",
    457         "Izhak Shafran",
    458         "Karthik Narasimhan",
    459         "Yuan Cao"
    460       ],
    461       "year": 2023,
    462       "relevance": "Foundational work on tool-augmented LLM agents, relevant to multi-model coordination approaches."
    463     },
    464     {
    465       "title": "A survey on mixture of experts in large language models",
    466       "authors": [
    467         "Weilin Cai",
    468         "Juyong Jiang",
    469         "Fan Wang",
    470         "Jing Tang",
    471         "Sunghun Kim",
    472         "Jiayi Huang"
    473       ],
    474       "year": 2024,
    475       "arxiv_id": "2407.06204",
    476       "relevance": "Surveys intra-model routing via MoE, contrasting with ACAR's inter-model routing approach."
    477     },
    478     {
    479       "title": "The Shapley value in machine learning",
    480       "authors": [
    481         "Benedek Rozemberczki",
    482         "Lauren Watson",
    483         "Péter Bayer"
    484       ],
    485       "year": 2022,
    486       "relevance": "Attribution methodology for ML model contributions, relevant to ACAR's failed attribution proxy experiments."
    487     }
    488   ],
    489   "engagement_factors": {
    490     "practical_relevance": {
    491       "score": 1,
    492       "justification": "The routing concept is potentially useful but the specific implementation is tightly coupled to a custom substrate and the accuracy gains are marginal (1.2pp)."
    493     },
    494     "surprise_contrarian": {
    495       "score": 2,
    496       "justification": "The finding that retrieval augmentation consistently hurts performance (-3.4pp) is counterintuitive and challenges the 'more context is better' assumption prevalent in RAG discourse."
    497     },
    498     "fear_safety": {
    499       "score": 0,
    500       "justification": "No safety, security, or risk angle is present in the paper."
    501     },
    502     "drama_conflict": {
    503       "score": 0,
    504       "justification": "No controversy, company criticism, or replication failure — the paper is a straightforward technical evaluation."
    505     },
    506     "demo_ability": {
    507       "score": 1,
    508       "justification": "Code and artifacts are released on GitHub but require significant setup with three paid API providers to reproduce."
    509     },
    510     "brand_recognition": {
    511       "score": 0,
    512       "justification": "Single unknown author, no venue, no institutional affiliation listed."
    513     }
    514   }
    515 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs