scan-v4.json (29597B)
1 { 2 "scan_version": 4, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "ACAR: Adaptive Complexity Routing for Multi-Model Ensembles with Auditable Decision Traces", 6 "authors": [ 7 "Ramchand Kumaresan" 8 ], 9 "year": 2026, 10 "venue": "arXiv", 11 "arxiv_id": "2602.21231", 12 "doi": null 13 }, 14 "checklist": { 15 "claims_and_evidence": { 16 "abstract_claims_supported": { 17 "applies": true, 18 "answer": true, 19 "justification": "Abstract claims (55.6% accuracy, -3.4pp retrieval, 8pp gap, weak attribution correlation) are supported by results. The abstract explicitly states what does not hold.", 20 "source": "opus" 21 }, 22 "causal_claims_justified": { 23 "applies": true, 24 "answer": true, 25 "justification": "Causal claims like 'retrieval augmentation decreased accuracy' are supported by controlled comparison (ACAR-U vs ACAR-UJ, same setup minus retrieval). The ablation design is adequate.", 26 "source": "opus" 27 }, 28 "generalization_bounded": { 29 "applies": true, 30 "answer": true, 31 "justification": "Section 8 explicitly bounds scope: 'Three models from major providers; may not generalize to open-source models.' SuperGPQA dominance acknowledged. Paper frames itself as a measurement framework.", 32 "source": "opus" 33 }, 34 "alternative_explanations_discussed": { 35 "applies": true, 36 "answer": true, 37 "justification": "Section 6 discusses why retrieval fails (low similarity, not inherent limitation), why accuracy ceiling exists (intrinsic to self-consistency), and Section 8 notes code equivalence inflating LiveCodeBench escalation.", 38 "source": "opus" 39 }, 40 "proxy_outcome_distinction": { 41 "applies": true, 42 "answer": true, 43 "justification": "The paper measures accuracy on four benchmarks and frames results precisely at that granularity: '55.6% accuracy' on specific tasks. It explicitly discusses what accuracy does NOT capture (Section 8 limitations). The paper frames itself as 'a measurement framework' rather than claiming broader routing effectiveness.", 44 "source": "opus" 45 } 46 }, 47 "limitations_and_scope": { 48 "limitations_section_present": { 49 "applies": true, 50 "answer": true, 51 "justification": "Section 8 'Limitations' lists four specific limitations: model set, benchmark bias, no learned routing, and code equivalence.", 52 "source": "opus" 53 }, 54 "threats_to_validity_specific": { 55 "applies": true, 56 "answer": true, 57 "justification": "Limitations are specific: 'SuperGPQA dominates (66% of tasks)', 'LiveCodeBench escalation is inflated by syntactically different but semantically equivalent outputs', three-model limitation.", 58 "source": "opus" 59 }, 60 "scope_boundaries_stated": { 61 "applies": true, 62 "answer": true, 63 "justification": "Section 8 states what results do not show: does not generalize to open-source models, learned routers may outperform on specific distributions.", 64 "source": "opus" 65 } 66 }, 67 "conflicts_of_interest": { 68 "funding_disclosed": { 69 "applies": true, 70 "answer": false, 71 "justification": "No funding source or acknowledgments section mentioning grants or sponsors.", 72 "source": "opus" 73 }, 74 "affiliations_disclosed": { 75 "applies": true, 76 "answer": false, 77 "justification": "Author listed as 'Ramchand Kumaresan' with no institutional affiliation provided.", 78 "source": "opus" 79 }, 80 "funder_independent_of_outcome": { 81 "applies": true, 82 "answer": false, 83 "justification": "No funding information disclosed, so independence cannot be assessed.", 84 "source": "opus" 85 }, 86 "financial_interests_declared": { 87 "applies": true, 88 "answer": false, 89 "justification": "No competing interests statement or financial disclosure present in the paper.", 90 "source": "opus" 91 } 92 }, 93 "scope_and_framing": { 94 "key_terms_defined": { 95 "applies": true, 96 "answer": true, 97 "justification": "Key terms formally defined: self-consistency variance σ (Definition 1 with formula), execution mode (Definition 2), TEAMLLM infrastructure (section 3.1), auditable decision traces. Most key terms have precise definitions; 'retrieval augmentation' explained but not formally defined until method section.", 98 "source": "haiku" 99 }, 100 "intended_contribution_clear": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 1.2 lists three explicit contributions: (1) ACAR routing mechanism (55.6% accuracy), (2) negative result on retrieval, (3) TEAMLLM infrastructure release. Abstract frames this as a 'measurement framework for studying multi-model orchestration.' Clear value propositions stated.", 104 "source": "haiku" 105 }, 106 "engagement_with_prior_work": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section 2 systematically engages prior work across three areas (routing, cost-aware inference, reproducible benchmarking). For each area, paper explains how ACAR differs: 'ACAR differs in three ways: (1) we use self-consistency rather than learned classifiers... (2) we log complete decision traces... (3) we explicitly measure and report failure modes.' Substantive engagement beyond listing.", 110 "source": "haiku" 111 } 112 } 113 }, 114 "type_checklist": { 115 "empirical": { 116 "artifacts": { 117 "code_released": { 118 "applies": true, 119 "answer": true, 120 "justification": "GitHub repository URL provided: https://github.com/mechramc/ACAR-TeamLLM. Paper states 'Code and artifacts are publicly available' (Section 1.2 footnote).", 121 "source": "opus" 122 }, 123 "data_released": { 124 "applies": true, 125 "answer": true, 126 "justification": "Appendix B lists complete artifact manifest including runs.jsonl for all configurations (ACAR-U, ACAR-UJ, Arena-3, Arena-2, single-model). The benchmarks used are public (MathArena, Reasoning Gym, LiveCodeBench, SuperGPQA).", 127 "source": "opus" 128 }, 129 "environment_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "No requirements.txt, Dockerfile, conda environment, or library versions mentioned. Appendix A mentions 'environment fingerprint' in logs but does not describe the actual environment.", 133 "source": "opus" 134 }, 135 "reproduction_instructions": { 136 "applies": true, 137 "answer": true, 138 "justification": "Appendix A states 'All figures regenerable from released artifacts' with figure regeneration scripts. Appendix B provides the artifact directory structure. 208 unit tests for infrastructure validation are mentioned.", 139 "source": "opus" 140 } 141 }, 142 "statistical_methodology": { 143 "confidence_intervals_or_error_bars": { 144 "applies": true, 145 "answer": false, 146 "justification": "All results are point estimates. Table 1 shows raw accuracy percentages (e.g., 55.6%) with no confidence intervals or error bars.", 147 "source": "opus" 148 }, 149 "significance_tests": { 150 "applies": true, 151 "answer": false, 152 "justification": "The paper claims ACAR-U 'exceeds' Arena-2 (55.6% vs 54.4%) without any significance test. No p-values, bootstrap tests, or other statistical tests are reported.", 153 "source": "opus" 154 }, 155 "effect_sizes_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Effect sizes are reported with baseline context throughout: '1.2 percentage points' improvement over Arena-2, '-3.4pp' for retrieval, '8pp gap' to Arena-3. Table 2 provides per-benchmark deltas.", 159 "source": "opus" 160 }, 161 "sample_size_justified": { 162 "applies": true, 163 "answer": false, 164 "justification": "No justification for the 1,510 task count or the per-benchmark counts (60, 250, 200, 1000). No power analysis or discussion of whether 60 MathArena tasks is sufficient.", 165 "source": "opus" 166 }, 167 "variance_reported": { 168 "applies": true, 169 "answer": false, 170 "justification": "Results are from single experimental runs. No variance, standard deviation, or spread measures across runs are reported.", 171 "source": "opus" 172 } 173 }, 174 "evaluation_design": { 175 "baselines_included": { 176 "applies": true, 177 "answer": true, 178 "justification": "Three baselines: Single-Model (best single model), Arena-2 (two-model ensemble), Arena-3 (three-model ensemble). Table 1 compares all configurations.", 179 "source": "opus" 180 }, 181 "baselines_contemporary": { 182 "applies": true, 183 "answer": false, 184 "justification": "No comparison against learned routing systems (RouterBench, FrugalGPT, RouteLLM) discussed in related work. The baselines are naive fixed-ensemble strategies, not state-of-the-art routing methods.", 185 "source": "opus" 186 }, 187 "ablation_study": { 188 "applies": true, 189 "answer": true, 190 "justification": "ACAR-U vs ACAR-UJ is an ablation of the retrieval component. Per-benchmark breakdowns and escalation analysis further isolate component contributions.", 191 "source": "opus" 192 }, 193 "multiple_metrics": { 194 "applies": true, 195 "answer": true, 196 "justification": "Reports accuracy, cost (USD), escalation rate, and latency (ms) across configurations.", 197 "source": "opus" 198 }, 199 "human_evaluation": { 200 "applies": false, 201 "answer": false, 202 "justification": "The paper evaluates automated benchmark performance with execution-verified answers. Human evaluation is not relevant to the claims about routing accuracy and cost.", 203 "source": "opus" 204 }, 205 "held_out_test_set": { 206 "applies": true, 207 "answer": true, 208 "justification": "The benchmarks are external test sets not used for any tuning. ACAR uses no learned parameters that could overfit.", 209 "source": "opus" 210 }, 211 "per_category_breakdown": { 212 "applies": true, 213 "answer": true, 214 "justification": "Figure 3 and Table 2 provide per-benchmark breakdowns. Figure 5 shows escalation distribution by benchmark.", 215 "source": "opus" 216 }, 217 "failure_cases_discussed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Section 6 is entirely dedicated to failure modes: retrieval hurting performance (6.1), agreement-but-wrong (6.2), and attribution proxy failure (6.3).", 221 "source": "opus" 222 }, 223 "negative_results_reported": { 224 "applies": true, 225 "answer": true, 226 "justification": "Three significant negative results are prominently reported in Section 6. This is a notable strength of the paper.", 227 "source": "opus" 228 } 229 }, 230 "setup_transparency": { 231 "model_versions_specified": { 232 "applies": true, 233 "answer": false, 234 "justification": "Models listed as 'Claude Sonnet 4', 'GPT-4o', 'Gemini 2.0 Flash' without specific API versions or snapshot dates.", 235 "source": "opus" 236 }, 237 "prompts_provided": { 238 "applies": true, 239 "answer": false, 240 "justification": "No prompt text is provided in the paper or appendix. Algorithm 1 describes the routing procedure but not the actual prompts sent to models.", 241 "source": "opus" 242 }, 243 "hyperparameters_reported": { 244 "applies": true, 245 "answer": true, 246 "justification": "Temperature 0 stated for deterministic evaluation (Section 4.2). N=3 probe samples specified. Retrieval similarity threshold of 0.0 stated for ACAR-UJ.", 247 "source": "opus" 248 }, 249 "scaffolding_described": { 250 "applies": true, 251 "answer": true, 252 "justification": "Algorithm 1 provides the complete routing procedure. Section 3.1 describes TEAMLLM substrate with deterministic execution, immutable artifacts, and state machine. Section 3.2 details σ-based routing.", 253 "source": "opus" 254 }, 255 "data_preprocessing_documented": { 256 "applies": true, 257 "answer": false, 258 "justification": "No description of how 1,510 tasks were selected from the four benchmarks. No sampling methodology for the 1,000 SuperGPQA tasks or 60 MathArena tasks.", 259 "source": "opus" 260 } 261 }, 262 "data_integrity": { 263 "raw_data_available": { 264 "applies": true, 265 "answer": true, 266 "justification": "Appendix B lists complete artifact manifest with runs.jsonl containing per-task decision traces for all configurations. 7,550+ auditable runs released.", 267 "source": "opus" 268 }, 269 "data_collection_described": { 270 "applies": true, 271 "answer": true, 272 "justification": "Section 4.1 describes the four benchmarks, their task counts, and types. Section 3.1 describes TEAMLLM execution substrate.", 273 "source": "opus" 274 }, 275 "recruitment_methods_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants. Data comes from standard benchmarks.", 279 "source": "opus" 280 }, 281 "data_pipeline_documented": { 282 "applies": true, 283 "answer": true, 284 "justification": "Section 3.1 describes the TEAMLLM pipeline: PENDING → EXECUTING → VERIFYING → COMPLETED. Algorithm 1 shows the complete procedure. Appendix A confirms zero parse errors.", 285 "source": "opus" 286 } 287 }, 288 "contamination": { 289 "training_cutoff_stated": { 290 "applies": true, 291 "answer": false, 292 "justification": "No training data cutoff dates stated for any of the three models used.", 293 "source": "opus" 294 }, 295 "train_test_overlap_discussed": { 296 "applies": true, 297 "answer": false, 298 "justification": "No discussion of whether benchmark tasks may have appeared in model training data.", 299 "source": "opus" 300 }, 301 "benchmark_contamination_addressed": { 302 "applies": true, 303 "answer": false, 304 "justification": "No contamination analysis for any of the four benchmarks. LiveCodeBench uses temporal splits which helps but this is not discussed in the context of contamination.", 305 "source": "opus" 306 } 307 }, 308 "human_studies": { 309 "pre_registered": { 310 "applies": false, 311 "answer": false, 312 "justification": "No human participants in this study.", 313 "source": "opus" 314 }, 315 "irb_or_ethics_approval": { 316 "applies": false, 317 "answer": false, 318 "justification": "No human participants in this study.", 319 "source": "opus" 320 }, 321 "demographics_reported": { 322 "applies": false, 323 "answer": false, 324 "justification": "No human participants in this study.", 325 "source": "opus" 326 }, 327 "inclusion_exclusion_criteria": { 328 "applies": false, 329 "answer": false, 330 "justification": "No human participants in this study.", 331 "source": "opus" 332 }, 333 "randomization_described": { 334 "applies": false, 335 "answer": false, 336 "justification": "No human participants in this study.", 337 "source": "opus" 338 }, 339 "blinding_described": { 340 "applies": false, 341 "answer": false, 342 "justification": "No human participants in this study.", 343 "source": "opus" 344 }, 345 "attrition_reported": { 346 "applies": false, 347 "answer": false, 348 "justification": "No human participants in this study.", 349 "source": "opus" 350 } 351 }, 352 "cost_and_practicality": { 353 "inference_cost_reported": { 354 "applies": true, 355 "answer": true, 356 "justification": "Table 1 reports total cost in USD for each configuration: Single-Model $17.04, Arena-2 $20.64, ACAR-U $20.34, Arena-3 $20.64.", 357 "source": "opus" 358 }, 359 "compute_budget_stated": { 360 "applies": true, 361 "answer": true, 362 "justification": "Total API costs stated per configuration. 7,550+ total runs documented. Latency reported in Figure 7.", 363 "source": "opus" 364 } 365 }, 366 "experimental_rigor": { 367 "seed_sensitivity_reported": { 368 "applies": true, 369 "answer": false, 370 "justification": "No multi-seed analysis. Temperature 0 used for determinism but no seed sensitivity study.", 371 "source": "opus" 372 }, 373 "number_of_runs_stated": { 374 "applies": true, 375 "answer": true, 376 "justification": "7,550+ total runs explicitly stated. 1,510 tasks per configuration. N=3 probe samples per task.", 377 "source": "opus" 378 }, 379 "hyperparameter_search_budget": { 380 "applies": true, 381 "answer": false, 382 "justification": "No hyperparameter search reported. N=3, σ thresholds, and retrieval similarity threshold of 0.0 appear chosen without systematic search.", 383 "source": "opus" 384 }, 385 "best_config_selection_justified": { 386 "applies": true, 387 "answer": true, 388 "justification": "All configurations reported and compared in Table 1. Both positive and negative results shown. No cherry-picking.", 389 "source": "opus" 390 }, 391 "multiple_comparison_correction": { 392 "applies": false, 393 "answer": false, 394 "justification": "No statistical tests performed, so multiple comparison correction is not applicable.", 395 "source": "opus" 396 }, 397 "self_comparison_bias_addressed": { 398 "applies": true, 399 "answer": false, 400 "justification": "Authors evaluate their own ACAR system against baselines they implemented. No acknowledgment of self-comparison bias.", 401 "source": "opus" 402 }, 403 "compute_budget_vs_performance": { 404 "applies": true, 405 "answer": true, 406 "justification": "Figure 4 shows cost vs accuracy Pareto frontier. Table 1 reports both accuracy and cost for all configurations.", 407 "source": "opus" 408 }, 409 "benchmark_construct_validity": { 410 "applies": true, 411 "answer": false, 412 "justification": "No discussion of whether the four benchmarks actually measure the capabilities ACAR claims to route for.", 413 "source": "opus" 414 }, 415 "scaffold_confound_addressed": { 416 "applies": true, 417 "answer": true, 418 "justification": "The paper explicitly addresses scaffolding as a variable: ACAR-U vs ACAR-UJ ablates the retrieval scaffold, and the routing mechanism itself is the scaffold being studied. The paper does not compare models across different scaffolds and attribute differences to models — it evaluates the scaffold (routing) as the primary variable.", 419 "source": "opus" 420 } 421 }, 422 "data_leakage": { 423 "temporal_leakage_addressed": { 424 "applies": true, 425 "answer": false, 426 "justification": "No discussion of whether benchmark problems existed before models' training cutoffs.", 427 "source": "opus" 428 }, 429 "feature_leakage_addressed": { 430 "applies": true, 431 "answer": false, 432 "justification": "No discussion of whether the evaluation setup leaks information through routing or probe context.", 433 "source": "opus" 434 }, 435 "non_independence_addressed": { 436 "applies": true, 437 "answer": false, 438 "justification": "No discussion of train/test independence across the four benchmarks.", 439 "source": "opus" 440 }, 441 "leakage_detection_method": { 442 "applies": true, 443 "answer": false, 444 "justification": "No leakage detection or prevention method applied.", 445 "source": "opus" 446 } 447 } 448 } 449 }, 450 "claims": [ 451 { 452 "claim": "σ-based routing achieves 55.6% accuracy, exceeding the two-model baseline (54.4%)", 453 "evidence": "Table 1 shows ACAR-U at 55.6% (839/1510 correct) vs Arena-2 at 54.4% (822/1510 correct)", 454 "supported": "strong" 455 }, 456 { 457 "claim": "Adaptive routing avoids full ensembling on 54.2% of tasks", 458 "evidence": "Figure 6 shows cumulative full-arena usage reaching 45.8% by final task, implying 54.2% routed to single-agent or lite modes", 459 "supported": "strong" 460 }, 461 { 462 "claim": "Retrieval augmentation with low-quality stores decreases accuracy by 3.4 percentage points overall", 463 "evidence": "Table 2 shows ACAR-U 55.6% vs ACAR-UJ 52.4%, difference of 3.2pp (paper claims 3.4pp—minor discrepancy)", 464 "supported": "strong" 465 }, 466 { 467 "claim": "Agreement-but-wrong failure mode bounds achievable accuracy at 8pp below full ensembling", 468 "evidence": "Table 1 shows ACAR-U 55.6% vs Arena-3 ceiling 63.6%, gap of 8.0pp. Section 6.2 explains when all probe samples agree incorrectly, no ensemble recovery possible", 469 "supported": "strong" 470 }, 471 { 472 "claim": "Retrieval utility requires semantic similarity threshold >0.7; median retrieved experiences had only 0.167 similarity", 473 "evidence": "Figure 9 shows median similarity 0.167 (p90: 0.833) for 837 retrieved experiences. Section 6.1 attributes performance loss to low-quality matches", 474 "supported": "strong" 475 }, 476 { 477 "claim": "Attribution proxies (response similarity, entropy) show weak correlation with ground-truth leave-one-out values", 478 "evidence": "Section 6.3 reports attribution experiment but provides no quantified correlation coefficients. Claims proxy signals 'showed weak correlation' without numerical support", 479 "supported": "moderate" 480 }, 481 { 482 "claim": "Self-consistency variance (σ) routing is model-agnostic and requires no learned components", 483 "evidence": "Algorithm 1 shows purely heuristic routing based on answer agreement counts. No training or model-specific tuning required", 484 "supported": "strong" 485 }, 486 { 487 "claim": "TEAMLLM provides deterministic execution with immutable artifacts enabling reproducible multi-model research", 488 "evidence": "Section 3.1 specifies three invariants: deterministic execution with logged seeds, immutable append-only artifacts, forward-only state machine. 7,550+ runs logged", 489 "supported": "strong" 490 } 491 ], 492 "methodology_tags": [ 493 "benchmark-eval" 494 ], 495 "key_findings": "ACAR achieves 55.6% accuracy on 1,510 multi-benchmark tasks by adaptively routing 54.2% to single-model execution based on self-consistency variance, exceeding two-model baselines while costing less. Three systematic failures are documented: retrieval augmentation hurts accuracy by 3.4pp unless similarity threshold >0.7 is enforced, models' unanimous incorrect agreement (σ=0) cannot be recovered by ensembling and bounds maximum achievable accuracy 8pp below full ensemble, and post-hoc attribution proxies (similarity, entropy) do not correlate with ground-truth leave-one-out contributions. The work prioritizes auditability and negative-result reporting over routing accuracy optimization.", 496 "red_flags": [ 497 { 498 "flag": "No statistical significance testing", 499 "detail": "1.2pp improvement (55.6% vs 54.4%) not tested for significance. With 1,510 tasks, confidence intervals and p-values could be computed but are not." 500 }, 501 { 502 "flag": "No confidence intervals or uncertainty quantification", 503 "detail": "All results reported as point estimates. No error bars, bootstrap CIs, or variance measures despite evaluating 1,510 tasks." 504 }, 505 { 506 "flag": "Model versions lack snapshot dates", 507 "detail": "Models identified as 'Claude Sonnet 4,' 'GPT-4o,' 'Gemini 2.0 Flash' without version checkpoints or training cutoff dates. Reproducibility limited without exact model versions." 508 }, 509 { 510 "flag": "Prompts not provided", 511 "detail": "Paper references 'prompt template hash' but does not provide actual system prompts or task prompts. Reproducibility requires prompt access." 512 }, 513 { 514 "flag": "Contamination not addressed", 515 "detail": "No discussion of whether benchmark tasks overlap with model training data. No explicit verification of test set cleanliness relative to model cutoffs." 516 }, 517 { 518 "flag": "Sample size not justified", 519 "detail": "Evaluation uses 1,510 tasks but provides no justification. No power analysis or sample size calculation discussed." 520 }, 521 { 522 "flag": "Attribution analysis incomplete", 523 "detail": "Section 6.3 claims proxy signals 'showed weak correlation' with ground truth but provides no correlation coefficients, r-values, or statistical measures." 524 }, 525 { 526 "flag": "Benchmark composition severely imbalanced", 527 "detail": "SuperGPQA comprises 1,000/1,510 tasks (66%). Results may overweight multiple-choice QA performance." 528 }, 529 { 530 "flag": "No funding or affiliation disclosure", 531 "detail": "Author listed without institutional affiliation. No funding source stated. Standard disclosure statements absent." 532 }, 533 { 534 "flag": "Answer extraction method not specified", 535 "detail": "EXTRACT(ri) function referenced in Algorithm 1 but not defined. How answers are canonicalized from model responses unclear." 536 } 537 ], 538 "cited_papers": [ 539 { 540 "title": "RouterBench: A benchmark for multi-LLM routing system", 541 "authors": "Hu et al.", 542 "year": 2024, 543 "relevance": "Establishes benchmark for evaluating LLM routing systems; ACAR contrasts with learned routing approaches" 544 }, 545 { 546 "title": "FrugalGPT: How to use large language models while reducing cost and improving performance", 547 "authors": "Chen, Zaharia, Zou", 548 "year": 2023, 549 "relevance": "Cost-aware inference strategy; ACAR addresses similar cost-quality tradeoff but via self-consistency instead of learned cascades" 550 }, 551 { 552 "title": "RouteLLM: Learning to route LLMs with preference data", 553 "authors": "Ong et al.", 554 "year": 2025, 555 "relevance": "Learning-based routing; ACAR explicitly rejects learned routers in favor of heuristic interpretability" 556 }, 557 { 558 "title": "ReAct: Synergizing reasoning and acting in language models", 559 "authors": "Yao et al.", 560 "year": 2023, 561 "relevance": "Agentic reasoning with tools; tangential to multi-model routing but relevant for task complexity estimation" 562 }, 563 { 564 "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models on Code", 565 "authors": "[benchmark paper]", 566 "year": 2024, 567 "relevance": "Execution-verified code benchmark with temporal splits; used as evaluation dataset and exemplar of rigorous benchmark design" 568 }, 569 { 570 "title": "SuperGPQA: A Benchmark for LLM Knowledge Breadth", 571 "authors": "[benchmark paper]", 572 "year": 2024, 573 "relevance": "Large-scale multiple-choice QA benchmark; comprises 66% of evaluation tasks in this work" 574 } 575 ], 576 "engagement_factors": { 577 "practical_relevance": { 578 "score": 1, 579 "justification": "The routing concept is potentially useful but the specific implementation is tightly coupled to a custom substrate and the accuracy gains are marginal (1.2pp)." 580 }, 581 "surprise_contrarian": { 582 "score": 2, 583 "justification": "The finding that retrieval augmentation consistently hurts performance (-3.4pp) is counterintuitive and challenges the 'more context is better' assumption prevalent in RAG discourse." 584 }, 585 "fear_safety": { 586 "score": 0, 587 "justification": "No safety, security, or risk angle is present in the paper." 588 }, 589 "drama_conflict": { 590 "score": 0, 591 "justification": "No controversy, company criticism, or replication failure — the paper is a straightforward technical evaluation." 592 }, 593 "demo_ability": { 594 "score": 1, 595 "justification": "Code and artifacts are released on GitHub but require significant setup with three paid API providers to reproduce." 596 }, 597 "brand_recognition": { 598 "score": 0, 599 "justification": "Single unknown author, no venue, no institutional affiliation listed." 600 } 601 }, 602 "hn_data": { 603 "threads": [ 604 { 605 "hn_id": "47154950", 606 "title": "Aletheia Tackles FirstProof Autonomously", 607 "points": 5, 608 "comments": 0, 609 "url": "https://news.ycombinator.com/item?id=47154950", 610 "created_at": "2026-02-25T17:46:36Z" 611 }, 612 { 613 "hn_id": "47314080", 614 "title": "Latent Context Compilation: Distilling Long Context into Compact Portable Memory", 615 "points": 2, 616 "comments": 0, 617 "url": "https://news.ycombinator.com/item?id=47314080", 618 "created_at": "2026-03-09T19:21:30Z" 619 } 620 ], 621 "top_points": 5, 622 "total_points": 7, 623 "total_comments": 0 624 } 625 }