scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (29597B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "ACAR: Adaptive Complexity Routing for Multi-Model Ensembles with Auditable Decision Traces",
      6     "authors": [
      7       "Ramchand Kumaresan"
      8     ],
      9     "year": 2026,
     10     "venue": "arXiv",
     11     "arxiv_id": "2602.21231",
     12     "doi": null
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Abstract claims (55.6% accuracy, -3.4pp retrieval, 8pp gap, weak attribution correlation) are supported by results. The abstract explicitly states what does not hold.",
     20         "source": "opus"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Causal claims like 'retrieval augmentation decreased accuracy' are supported by controlled comparison (ACAR-U vs ACAR-UJ, same setup minus retrieval). The ablation design is adequate.",
     26         "source": "opus"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Section 8 explicitly bounds scope: 'Three models from major providers; may not generalize to open-source models.' SuperGPQA dominance acknowledged. Paper frames itself as a measurement framework.",
     32         "source": "opus"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Section 6 discusses why retrieval fails (low similarity, not inherent limitation), why accuracy ceiling exists (intrinsic to self-consistency), and Section 8 notes code equivalence inflating LiveCodeBench escalation.",
     38         "source": "opus"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "The paper measures accuracy on four benchmarks and frames results precisely at that granularity: '55.6% accuracy' on specific tasks. It explicitly discusses what accuracy does NOT capture (Section 8 limitations). The paper frames itself as 'a measurement framework' rather than claiming broader routing effectiveness.",
     44         "source": "opus"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Section 8 'Limitations' lists four specific limitations: model set, benchmark bias, no learned routing, and code equivalence.",
     52         "source": "opus"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Limitations are specific: 'SuperGPQA dominates (66% of tasks)', 'LiveCodeBench escalation is inflated by syntactically different but semantically equivalent outputs', three-model limitation.",
     58         "source": "opus"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Section 8 states what results do not show: does not generalize to open-source models, learned routers may outperform on specific distributions.",
     64         "source": "opus"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No funding source or acknowledgments section mentioning grants or sponsors.",
     72         "source": "opus"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "Author listed as 'Ramchand Kumaresan' with no institutional affiliation provided.",
     78         "source": "opus"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No funding information disclosed, so independence cannot be assessed.",
     84         "source": "opus"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests statement or financial disclosure present in the paper.",
     90         "source": "opus"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Key terms formally defined: self-consistency variance σ (Definition 1 with formula), execution mode (Definition 2), TEAMLLM infrastructure (section 3.1), auditable decision traces. Most key terms have precise definitions; 'retrieval augmentation' explained but not formally defined until method section.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 1.2 lists three explicit contributions: (1) ACAR routing mechanism (55.6% accuracy), (2) negative result on retrieval, (3) TEAMLLM infrastructure release. Abstract frames this as a 'measurement framework for studying multi-model orchestration.' Clear value propositions stated.",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 2 systematically engages prior work across three areas (routing, cost-aware inference, reproducible benchmarking). For each area, paper explains how ACAR differs: 'ACAR differs in three ways: (1) we use self-consistency rather than learned classifiers... (2) we log complete decision traces... (3) we explicitly measure and report failure modes.' Substantive engagement beyond listing.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "empirical": {
    116       "artifacts": {
    117         "code_released": {
    118           "applies": true,
    119           "answer": true,
    120           "justification": "GitHub repository URL provided: https://github.com/mechramc/ACAR-TeamLLM. Paper states 'Code and artifacts are publicly available' (Section 1.2 footnote).",
    121           "source": "opus"
    122         },
    123         "data_released": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "Appendix B lists complete artifact manifest including runs.jsonl for all configurations (ACAR-U, ACAR-UJ, Arena-3, Arena-2, single-model). The benchmarks used are public (MathArena, Reasoning Gym, LiveCodeBench, SuperGPQA).",
    127           "source": "opus"
    128         },
    129         "environment_specified": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "No requirements.txt, Dockerfile, conda environment, or library versions mentioned. Appendix A mentions 'environment fingerprint' in logs but does not describe the actual environment.",
    133           "source": "opus"
    134         },
    135         "reproduction_instructions": {
    136           "applies": true,
    137           "answer": true,
    138           "justification": "Appendix A states 'All figures regenerable from released artifacts' with figure regeneration scripts. Appendix B provides the artifact directory structure. 208 unit tests for infrastructure validation are mentioned.",
    139           "source": "opus"
    140         }
    141       },
    142       "statistical_methodology": {
    143         "confidence_intervals_or_error_bars": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "All results are point estimates. Table 1 shows raw accuracy percentages (e.g., 55.6%) with no confidence intervals or error bars.",
    147           "source": "opus"
    148         },
    149         "significance_tests": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "The paper claims ACAR-U 'exceeds' Arena-2 (55.6% vs 54.4%) without any significance test. No p-values, bootstrap tests, or other statistical tests are reported.",
    153           "source": "opus"
    154         },
    155         "effect_sizes_reported": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "Effect sizes are reported with baseline context throughout: '1.2 percentage points' improvement over Arena-2, '-3.4pp' for retrieval, '8pp gap' to Arena-3. Table 2 provides per-benchmark deltas.",
    159           "source": "opus"
    160         },
    161         "sample_size_justified": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "No justification for the 1,510 task count or the per-benchmark counts (60, 250, 200, 1000). No power analysis or discussion of whether 60 MathArena tasks is sufficient.",
    165           "source": "opus"
    166         },
    167         "variance_reported": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "Results are from single experimental runs. No variance, standard deviation, or spread measures across runs are reported.",
    171           "source": "opus"
    172         }
    173       },
    174       "evaluation_design": {
    175         "baselines_included": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Three baselines: Single-Model (best single model), Arena-2 (two-model ensemble), Arena-3 (three-model ensemble). Table 1 compares all configurations.",
    179           "source": "opus"
    180         },
    181         "baselines_contemporary": {
    182           "applies": true,
    183           "answer": false,
    184           "justification": "No comparison against learned routing systems (RouterBench, FrugalGPT, RouteLLM) discussed in related work. The baselines are naive fixed-ensemble strategies, not state-of-the-art routing methods.",
    185           "source": "opus"
    186         },
    187         "ablation_study": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "ACAR-U vs ACAR-UJ is an ablation of the retrieval component. Per-benchmark breakdowns and escalation analysis further isolate component contributions.",
    191           "source": "opus"
    192         },
    193         "multiple_metrics": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Reports accuracy, cost (USD), escalation rate, and latency (ms) across configurations.",
    197           "source": "opus"
    198         },
    199         "human_evaluation": {
    200           "applies": false,
    201           "answer": false,
    202           "justification": "The paper evaluates automated benchmark performance with execution-verified answers. Human evaluation is not relevant to the claims about routing accuracy and cost.",
    203           "source": "opus"
    204         },
    205         "held_out_test_set": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "The benchmarks are external test sets not used for any tuning. ACAR uses no learned parameters that could overfit.",
    209           "source": "opus"
    210         },
    211         "per_category_breakdown": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Figure 3 and Table 2 provide per-benchmark breakdowns. Figure 5 shows escalation distribution by benchmark.",
    215           "source": "opus"
    216         },
    217         "failure_cases_discussed": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Section 6 is entirely dedicated to failure modes: retrieval hurting performance (6.1), agreement-but-wrong (6.2), and attribution proxy failure (6.3).",
    221           "source": "opus"
    222         },
    223         "negative_results_reported": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Three significant negative results are prominently reported in Section 6. This is a notable strength of the paper.",
    227           "source": "opus"
    228         }
    229       },
    230       "setup_transparency": {
    231         "model_versions_specified": {
    232           "applies": true,
    233           "answer": false,
    234           "justification": "Models listed as 'Claude Sonnet 4', 'GPT-4o', 'Gemini 2.0 Flash' without specific API versions or snapshot dates.",
    235           "source": "opus"
    236         },
    237         "prompts_provided": {
    238           "applies": true,
    239           "answer": false,
    240           "justification": "No prompt text is provided in the paper or appendix. Algorithm 1 describes the routing procedure but not the actual prompts sent to models.",
    241           "source": "opus"
    242         },
    243         "hyperparameters_reported": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "Temperature 0 stated for deterministic evaluation (Section 4.2). N=3 probe samples specified. Retrieval similarity threshold of 0.0 stated for ACAR-UJ.",
    247           "source": "opus"
    248         },
    249         "scaffolding_described": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "Algorithm 1 provides the complete routing procedure. Section 3.1 describes TEAMLLM substrate with deterministic execution, immutable artifacts, and state machine. Section 3.2 details σ-based routing.",
    253           "source": "opus"
    254         },
    255         "data_preprocessing_documented": {
    256           "applies": true,
    257           "answer": false,
    258           "justification": "No description of how 1,510 tasks were selected from the four benchmarks. No sampling methodology for the 1,000 SuperGPQA tasks or 60 MathArena tasks.",
    259           "source": "opus"
    260         }
    261       },
    262       "data_integrity": {
    263         "raw_data_available": {
    264           "applies": true,
    265           "answer": true,
    266           "justification": "Appendix B lists complete artifact manifest with runs.jsonl containing per-task decision traces for all configurations. 7,550+ auditable runs released.",
    267           "source": "opus"
    268         },
    269         "data_collection_described": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "Section 4.1 describes the four benchmarks, their task counts, and types. Section 3.1 describes TEAMLLM execution substrate.",
    273           "source": "opus"
    274         },
    275         "recruitment_methods_described": {
    276           "applies": false,
    277           "answer": false,
    278           "justification": "No human participants. Data comes from standard benchmarks.",
    279           "source": "opus"
    280         },
    281         "data_pipeline_documented": {
    282           "applies": true,
    283           "answer": true,
    284           "justification": "Section 3.1 describes the TEAMLLM pipeline: PENDING → EXECUTING → VERIFYING → COMPLETED. Algorithm 1 shows the complete procedure. Appendix A confirms zero parse errors.",
    285           "source": "opus"
    286         }
    287       },
    288       "contamination": {
    289         "training_cutoff_stated": {
    290           "applies": true,
    291           "answer": false,
    292           "justification": "No training data cutoff dates stated for any of the three models used.",
    293           "source": "opus"
    294         },
    295         "train_test_overlap_discussed": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "No discussion of whether benchmark tasks may have appeared in model training data.",
    299           "source": "opus"
    300         },
    301         "benchmark_contamination_addressed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "No contamination analysis for any of the four benchmarks. LiveCodeBench uses temporal splits which helps but this is not discussed in the context of contamination.",
    305           "source": "opus"
    306         }
    307       },
    308       "human_studies": {
    309         "pre_registered": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "No human participants in this study.",
    313           "source": "opus"
    314         },
    315         "irb_or_ethics_approval": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants in this study.",
    319           "source": "opus"
    320         },
    321         "demographics_reported": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants in this study.",
    325           "source": "opus"
    326         },
    327         "inclusion_exclusion_criteria": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants in this study.",
    331           "source": "opus"
    332         },
    333         "randomization_described": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants in this study.",
    337           "source": "opus"
    338         },
    339         "blinding_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants in this study.",
    343           "source": "opus"
    344         },
    345         "attrition_reported": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants in this study.",
    349           "source": "opus"
    350         }
    351       },
    352       "cost_and_practicality": {
    353         "inference_cost_reported": {
    354           "applies": true,
    355           "answer": true,
    356           "justification": "Table 1 reports total cost in USD for each configuration: Single-Model $17.04, Arena-2 $20.64, ACAR-U $20.34, Arena-3 $20.64.",
    357           "source": "opus"
    358         },
    359         "compute_budget_stated": {
    360           "applies": true,
    361           "answer": true,
    362           "justification": "Total API costs stated per configuration. 7,550+ total runs documented. Latency reported in Figure 7.",
    363           "source": "opus"
    364         }
    365       },
    366       "experimental_rigor": {
    367         "seed_sensitivity_reported": {
    368           "applies": true,
    369           "answer": false,
    370           "justification": "No multi-seed analysis. Temperature 0 used for determinism but no seed sensitivity study.",
    371           "source": "opus"
    372         },
    373         "number_of_runs_stated": {
    374           "applies": true,
    375           "answer": true,
    376           "justification": "7,550+ total runs explicitly stated. 1,510 tasks per configuration. N=3 probe samples per task.",
    377           "source": "opus"
    378         },
    379         "hyperparameter_search_budget": {
    380           "applies": true,
    381           "answer": false,
    382           "justification": "No hyperparameter search reported. N=3, σ thresholds, and retrieval similarity threshold of 0.0 appear chosen without systematic search.",
    383           "source": "opus"
    384         },
    385         "best_config_selection_justified": {
    386           "applies": true,
    387           "answer": true,
    388           "justification": "All configurations reported and compared in Table 1. Both positive and negative results shown. No cherry-picking.",
    389           "source": "opus"
    390         },
    391         "multiple_comparison_correction": {
    392           "applies": false,
    393           "answer": false,
    394           "justification": "No statistical tests performed, so multiple comparison correction is not applicable.",
    395           "source": "opus"
    396         },
    397         "self_comparison_bias_addressed": {
    398           "applies": true,
    399           "answer": false,
    400           "justification": "The author evaluates their own ACAR system against baselines they implemented. No acknowledgment of self-comparison bias.",
    401           "source": "opus"
    402         },
    403         "compute_budget_vs_performance": {
    404           "applies": true,
    405           "answer": true,
    406           "justification": "Figure 4 shows cost vs accuracy Pareto frontier. Table 1 reports both accuracy and cost for all configurations.",
    407           "source": "opus"
    408         },
    409         "benchmark_construct_validity": {
    410           "applies": true,
    411           "answer": false,
    412           "justification": "No discussion of whether the four benchmarks actually measure the capabilities ACAR claims to route for.",
    413           "source": "opus"
    414         },
    415         "scaffold_confound_addressed": {
    416           "applies": true,
    417           "answer": true,
    418           "justification": "The paper explicitly addresses scaffolding as a variable: ACAR-U vs ACAR-UJ ablates the retrieval scaffold, and the routing mechanism itself is the scaffold being studied. The paper does not compare models across different scaffolds and attribute differences to models — it evaluates the scaffold (routing) as the primary variable.",
    419           "source": "opus"
    420         }
    421       },
    422       "data_leakage": {
    423         "temporal_leakage_addressed": {
    424           "applies": true,
    425           "answer": false,
    426           "justification": "No discussion of whether benchmark problems existed before models' training cutoffs.",
    427           "source": "opus"
    428         },
    429         "feature_leakage_addressed": {
    430           "applies": true,
    431           "answer": false,
    432           "justification": "No discussion of whether the evaluation setup leaks information through routing or probe context.",
    433           "source": "opus"
    434         },
    435         "non_independence_addressed": {
    436           "applies": true,
    437           "answer": false,
    438           "justification": "No discussion of train/test independence across the four benchmarks.",
    439           "source": "opus"
    440         },
    441         "leakage_detection_method": {
    442           "applies": true,
    443           "answer": false,
    444           "justification": "No leakage detection or prevention method applied.",
    445           "source": "opus"
    446         }
    447       }
    448     }
    449   },
    450   "claims": [
    451     {
    452       "claim": "σ-based routing achieves 55.6% accuracy, exceeding the two-model baseline (54.4%)",
    453       "evidence": "Table 1 shows ACAR-U at 55.6% (839/1510 correct) vs Arena-2 at 54.4% (822/1510 correct)",
    454       "supported": "strong"
    455     },
    456     {
    457       "claim": "Adaptive routing avoids full ensembling on 54.2% of tasks",
    458       "evidence": "Figure 6 shows cumulative full-arena usage reaching 45.8% by final task, implying 54.2% routed to single-agent or lite modes",
    459       "supported": "strong"
    460     },
    461     {
    462       "claim": "Retrieval augmentation with low-quality stores decreases accuracy by 3.4 percentage points overall",
    463       "evidence": "Table 2 shows ACAR-U 55.6% vs ACAR-UJ 52.4%, difference of 3.2pp (paper claims 3.4pp—minor discrepancy)",
    464       "supported": "strong"
    465     },
    466     {
    467       "claim": "Agreement-but-wrong failure mode bounds achievable accuracy at 8pp below full ensembling",
    468       "evidence": "Table 1 shows ACAR-U 55.6% vs Arena-3 ceiling 63.6%, gap of 8.0pp. Section 6.2 explains that when all probe samples agree incorrectly, no ensemble recovery is possible",
    469       "supported": "strong"
    470     },
    471     {
    472       "claim": "Retrieval utility requires semantic similarity threshold >0.7; median retrieved experiences had only 0.167 similarity",
    473       "evidence": "Figure 9 shows median similarity 0.167 (p90: 0.833) for 837 retrieved experiences. Section 6.1 attributes performance loss to low-quality matches",
    474       "supported": "strong"
    475     },
    476     {
    477       "claim": "Attribution proxies (response similarity, entropy) show weak correlation with ground-truth leave-one-out values",
    478       "evidence": "Section 6.3 reports attribution experiment but provides no quantified correlation coefficients. Claims proxy signals 'showed weak correlation' without numerical support",
    479       "supported": "moderate"
    480     },
    481     {
    482       "claim": "Self-consistency variance (σ) routing is model-agnostic and requires no learned components",
    483       "evidence": "Algorithm 1 shows purely heuristic routing based on answer agreement counts. No training or model-specific tuning required",
    484       "supported": "strong"
    485     },
    486     {
    487       "claim": "TEAMLLM provides deterministic execution with immutable artifacts enabling reproducible multi-model research",
    488       "evidence": "Section 3.1 specifies three invariants: deterministic execution with logged seeds, immutable append-only artifacts, forward-only state machine. 7,550+ runs logged",
    489       "supported": "strong"
    490     }
    491   ],
    492   "methodology_tags": [
    493     "benchmark-eval"
    494   ],
    495   "key_findings": "ACAR achieves 55.6% accuracy on 1,510 multi-benchmark tasks by using self-consistency variance to adaptively route 54.2% of tasks away from full ensembling (to single-agent or lite modes), exceeding the two-model baseline (54.4%) while costing less. Three systematic failures are documented: retrieval augmentation hurts accuracy by 3.4pp unless similarity threshold >0.7 is enforced, models' unanimous incorrect agreement (σ=0) cannot be recovered by ensembling and bounds maximum achievable accuracy 8pp below full ensemble, and post-hoc attribution proxies (similarity, entropy) correlate only weakly with ground-truth leave-one-out contributions. The work prioritizes auditability and negative-result reporting over routing accuracy optimization.",
    496   "red_flags": [
    497     {
    498       "flag": "No statistical significance testing",
    499       "detail": "1.2pp improvement (55.6% vs 54.4%) not tested for significance. With 1,510 tasks, confidence intervals and p-values could be computed but are not."
    500     },
    501     {
    502       "flag": "No confidence intervals or uncertainty quantification",
    503       "detail": "All results reported as point estimates. No error bars, bootstrap CIs, or variance measures despite evaluating 1,510 tasks."
    504     },
    505     {
    506       "flag": "Model versions lack snapshot dates",
    507       "detail": "Models identified as 'Claude Sonnet 4,' 'GPT-4o,' 'Gemini 2.0 Flash' without version checkpoints or training cutoff dates. Reproducibility limited without exact model versions."
    508     },
    509     {
    510       "flag": "Prompts not provided",
    511       "detail": "Paper references 'prompt template hash' but does not provide actual system prompts or task prompts. Reproducibility requires prompt access."
    512     },
    513     {
    514       "flag": "Contamination not addressed",
    515       "detail": "No discussion of whether benchmark tasks overlap with model training data. No explicit verification of test set cleanliness relative to model cutoffs."
    516     },
    517     {
    518       "flag": "Sample size not justified",
    519       "detail": "Evaluation uses 1,510 tasks but provides no justification. No power analysis or sample size calculation discussed."
    520     },
    521     {
    522       "flag": "Attribution analysis incomplete",
    523       "detail": "Section 6.3 claims proxy signals 'showed weak correlation' with ground truth but provides no correlation coefficients, r-values, or statistical measures."
    524     },
    525     {
    526       "flag": "Benchmark composition severely imbalanced",
    527       "detail": "SuperGPQA comprises 1,000/1,510 tasks (66%). Results may overweight multiple-choice QA performance."
    528     },
    529     {
    530       "flag": "No funding or affiliation disclosure",
    531       "detail": "Author listed without institutional affiliation. No funding source stated. Standard disclosure statements absent."
    532     },
    533     {
    534       "flag": "Answer extraction method not specified",
    535       "detail": "EXTRACT(ri) function referenced in Algorithm 1 but not defined. How answers are canonicalized from model responses is unclear."
    536     }
    537   ],
    538   "cited_papers": [
    539     {
    540       "title": "RouterBench: A benchmark for multi-LLM routing system",
    541       "authors": "Hu et al.",
    542       "year": 2024,
    543       "relevance": "Establishes benchmark for evaluating LLM routing systems; ACAR contrasts with learned routing approaches"
    544     },
    545     {
    546       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    547       "authors": "Chen, Zaharia, Zou",
    548       "year": 2023,
    549       "relevance": "Cost-aware inference strategy; ACAR addresses similar cost-quality tradeoff but via self-consistency instead of learned cascades"
    550     },
    551     {
    552       "title": "RouteLLM: Learning to route LLMs with preference data",
    553       "authors": "Ong et al.",
    554       "year": 2025,
    555       "relevance": "Learning-based routing; ACAR explicitly rejects learned routers in favor of heuristic interpretability"
    556     },
    557     {
    558       "title": "ReAct: Synergizing reasoning and acting in language models",
    559       "authors": "Yao et al.",
    560       "year": 2023,
    561       "relevance": "Agentic reasoning with tools; tangential to multi-model routing but relevant for task complexity estimation"
    562     },
    563     {
    564       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models on Code",
    565       "authors": "[benchmark paper]",
    566       "year": 2024,
    567       "relevance": "Execution-verified code benchmark with temporal splits; used as evaluation dataset and exemplar of rigorous benchmark design"
    568     },
    569     {
    570       "title": "SuperGPQA: A Benchmark for LLM Knowledge Breadth",
    571       "authors": "[benchmark paper]",
    572       "year": 2024,
    573       "relevance": "Large-scale multiple-choice QA benchmark; comprises 66% of evaluation tasks in this work"
    574     }
    575   ],
    576   "engagement_factors": {
    577     "practical_relevance": {
    578       "score": 1,
    579       "justification": "The routing concept is potentially useful but the specific implementation is tightly coupled to a custom substrate and the accuracy gains are marginal (1.2pp)."
    580     },
    581     "surprise_contrarian": {
    582       "score": 2,
    583       "justification": "The finding that retrieval augmentation consistently hurts performance (-3.4pp) is counterintuitive and challenges the 'more context is better' assumption prevalent in RAG discourse."
    584     },
    585     "fear_safety": {
    586       "score": 0,
    587       "justification": "No safety, security, or risk angle is present in the paper."
    588     },
    589     "drama_conflict": {
    590       "score": 0,
    591       "justification": "No controversy, company criticism, or replication failure — the paper is a straightforward technical evaluation."
    592     },
    593     "demo_ability": {
    594       "score": 1,
    595       "justification": "Code and artifacts are released on GitHub but require significant setup with three paid API providers to reproduce."
    596     },
    597     "brand_recognition": {
    598       "score": 0,
    599       "justification": "Single unknown author, no peer-reviewed venue (arXiv preprint only), no institutional affiliation listed."
    600     }
    601   },
    602   "hn_data": {
    603     "threads": [
    604       {
    605         "hn_id": "47154950",
    606         "title": "Aletheia Tackles FirstProof Autonomously",
    607         "points": 5,
    608         "comments": 0,
    609         "url": "https://news.ycombinator.com/item?id=47154950",
    610         "created_at": "2026-02-25T17:46:36Z"
    611       },
    612       {
    613         "hn_id": "47314080",
    614         "title": "Latent Context Compilation: Distilling Long Context into Compact Portable Memory",
    615         "points": 2,
    616         "comments": 0,
    617         "url": "https://news.ycombinator.com/item?id=47314080",
    618         "created_at": "2026-03-09T19:21:30Z"
    619       }
    620     ],
    621     "top_points": 5,
    622     "total_points": 7,
    623     "total_comments": 0
    624   }
    625 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs