scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28377B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Dynamic Mix Precision Routing for Efficient Multi-step LLM Interaction",
      6     "authors": [
      7       "Yuanzhe Li",
      8       "Jianing Deng",
      9       "Jingtong Hu",
     10       "Tianlong Chen",
     11       "Song Wang"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv",
     15     "arxiv_id": "2602.02711",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All four abstract claims are supported: (1) LLMs succeed at long-horizon tasks (shown in intro), (2) inference cost is prohibitive (referenced in related work), (3) the framework is described in Section 4, (4) Table 1 demonstrates accuracy-cost improvements over baselines.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Ablation study (Section 5.3, Table 2) isolates KL-ST vs GRPO effects; comparisons against random routing and full-precision baselines support causal claims. However, limited to one benchmark environment.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Paper frames problem broadly ('long-horizon agentic tasks', 'real-world scenarios') but evaluates only on ALFWorld. No explicit scope boundaries stated. Claims like 'effective precision routing does not require a full-capacity model' generalize beyond tested domain without justification.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No discussion of alternative explanations. For instance, GRPO shows no improvement on 1.7B model (Table 2) but paper only attributes this to 'inherent limitation' without exploring other factors. No consideration of confounds.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Measured outcomes (success rate, high-precision usage, GHC metric) map directly to intended claims (task performance and inference efficiency). No conflation between proxy and ground truth.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No dedicated limitations or threats-to-validity section. Conclusion (Section 6) is one paragraph summarizing findings without acknowledging scope boundaries.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No specific threats discussed. Single-environment evaluation, limited quantization methods (GPTQ only), ad-hoc hyperparameter selection (KL threshold 'manually selected based on empirical distribution'), and inconsistent GRPO gains across model scales are not acknowledged.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No explicit boundaries stated. Paper does not acknowledge what the approach does NOT apply to (e.g., single-turn QA, retrieval tasks, or models without quantized variants).",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding statement or acknowledgments section visible in paper. No disclosure of funding sources.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations listed at top (University of Arizona, University of Pittsburgh, UNC Chapel Hill, University of Central Florida). Affiliations are disclosed; however, no explicit conflict-of-interest statement provided.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funder identified.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing-interests or financial-disclosure statement included. Paper evaluates Qwen and DeepSeek models but does not disclose any potential financial relationships.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Core terms are defined: 'agentic tasks' as long-horizon decision-making requiring tool use and environment interaction (Section 1), 'dynamic mix-precision routing' explained with architecture (Section 4), 'quantization' used consistently with standard meaning.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three contributions explicitly stated in introduction: (1) dynamic framework for step-level routing, (2) RL optimization of routing policy, (3) lightweight router sufficiency. Clear and distinct contributions.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 engages with three areas: LLM routing (FrugalGPT, HybridLLM, RouteLLM, etc.), agentic tasks, and quantization. Paper positions this work as step-level routing (vs. query-level) on agentic tasks, showing how it addresses limitations of prior routing work.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No code repository, GitHub link, or supplementary code provided. Paper is a preprint with no mention of code availability.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "ALFWorld benchmark is publicly available and evaluation follows standard protocol (Yao et al. 2022b unseen test set). However, the 200 training rollouts used for KL-ST supervision are not released.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements.txt, Dockerfile, or environment specification provided. Training details in Appendix B (batch size, learning rate) but no computational environment or dependency specs.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Experimental setup described in Section 5 and Appendix, but no step-by-step reproduction instructions. No code release makes reproduction difficult or impossible.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Table 1 reports success rates and metrics as point estimates with no confidence intervals, error bars, or standard deviations. Single evaluation run implied; no variance reporting.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests (t-tests, Mann-Whitney, etc.) reported. Comparisons presented descriptively (e.g., 88.8% vs 89.6%) without p-values or significance indicators.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "GHC metric directly measures effect size (gain per high-precision call). Improvements are quantified in percentage points. However, no formal effect-size statistics (Cohen's d, etc.) or confidence bounds reported.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "KL-ST uses 200 episodes (stated in Appendix B), GRPO uses 120 episodes. No power analysis or justification for these choices. Table 3 ablates data scale (100–400) showing variance in results but final choice not justified.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, std dev, or repeated runs reported. Especially problematic for RL results which are typically high-variance. Only point estimates in tables.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Comparisons include full-precision baseline (BF16), quantized-only baseline (GPTQ), and random routing at multiple thresholds (20%, 40%, 60%, 80%). Strong baseline set.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "BF16 and GPTQ baselines are contemporary. However, no comparison to other learned routing methods mentioned in related work (RouteLLM, BEST-Route, RouterDC, etc.), only random and single-precision baselines.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Section 5.3 compares KL-ST-only, GRPO-only, and KL-ST+GRPO (Figure 5, Table 2). Table 3 ablates training data scale (100–400 episodes). Ablations show both components contribute.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Success rate (primary), high-precision usage ratio, and GHC (unified efficiency metric) reported. Multiple perspectives on performance–cost trade-off.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "Not applicable. Task success is objective (environment-based); no human evaluation needed or provided.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Evaluation on 'unseen test task' following Yao et al. (2022b); ALFWorld has standard train/test split.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": false,
    218           "justification": "Results broken down by model family (Qwen, DeepSeek) and size (8B, 4B, 1.7B) but no breakdown by task type, difficulty, or failure mode within ALFWorld.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "Appendix A.2 provides one qualitative case study (cleaning cloth task) showing low-precision failure and router success. No systematic failure analysis or taxonomy of failure modes.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "GRPO-only achieves lower GHC than KL-ST+GRPO (Table 2). GRPO provides no benefit on 1.7B model. Table 3 shows GHC fluctuations with data scale. Some negative/mixed results included.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Model families and sizes specified (Qwen3-8B/4B/1.7B, DeepSeek-R1-Distill-Llama3-8B) but no version snapshots or release dates. For reproducibility, commit hashes or exact model cards needed.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "No system prompt or instruction template provided. For ALFWorld this may be standard, but explicit prompt text is absent. Critical for reproducibility of LLM outputs.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "KL-ST hyperparameters provided (batch 64, lr 1e-4, 5 epochs, Appendix B). GRPO hyperparameters provided (lr 1e-6, KL weight 0.02, 120 episodes). However, KL threshold selection described as 'manually selected between 78th and 85th percentiles'—vague and ad-hoc.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Router architecture detailed in Section 4 and Figure 3: 2-layer Transformer encoder with position embeddings, masked attention, and softmax classification. Scaffolding is the routing framework itself and is well-described.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": false,
    262           "justification": "Section 4.1 states step embeddings are 'pre-computed by an external encoder and treated as fixed inputs' but does not specify which encoder, how it works, or its parameters. Critical preprocessing detail missing.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "ALFWorld is publicly available. However, the 200 high-precision rollouts collected for KL-ST training are not released, limiting reproducibility of training data.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 4.2 describes trajectory sampling protocol: high-precision rollout strategy, measurement of step-wise KL divergence, retention of successful trajectories only. Well-documented collection process.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human subjects involved.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Pipeline from trajectory collection → KL divergence computation → KL-ST training → GRPO optimization is documented in Sections 4.2–4.3 and Appendix B with sufficient detail.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training cutoff date stated for Qwen3 or DeepSeek-R1 models. These are 2024–2025 models but exact training data cutoff unknown. ALFWorld cutoff not discussed.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of potential overlap between ALFWorld test set and base model training data. With models released in 2025 being evaluated on a 2021 benchmark, contamination risk is not addressed.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "ALFWorld (2021) may have been included in base model training data. Paper does not acknowledge or test for this risk.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "High-precision usage ratio and GHC metric quantify inference cost trade-offs. However, actual latency, wall-clock time, or GPU hours are NOT reported. Cost is relative, not absolute.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Episode budgets stated (200 for KL-ST, 120 for GRPO) but no compute cost in GPU hours, FLOPs, or dollars. Cannot assess practical compute requirements.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Low-precision quantized LLMs exhibit step-wise diversity in sensitivity: most steps tolerate quantization but a small fraction are 'critical' and require high precision.",
    375       "evidence": "Figure 4 shows step-wise KL divergence distribution is 'highly skewed' with heavy right tail across Qwen3-8B, Qwen3-4B, and DeepSeek models. Section 1 and 4.2 frame this observation.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Dynamic step-level mix-precision routing achieves superior accuracy–cost trade-off compared to random routing and full-precision baselines.",
    380       "evidence": "Table 1: Router achieves GHC scores (19.85–43.02) consistently higher than random routing (2–18.67) across all models. Approaches full-precision performance (89.6% vs 88.8% on Qwen3-8B) with 26.7% high-precision usage.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "KL-divergence-based supervised learning (KL-ST) identifies precision-sensitive steps better than unsupervised methods.",
    385       "evidence": "Figure 5 and Table 2: KL-ST achieves higher GHC than GRPO-only (18.79 vs 10.74 on Qwen3-8B). KL-ST provides 'stable initialization' for downstream RL. Table 3 shows GHC is highest at the largest KL-ST data scale but varies non-monotonically (100 episodes: 14.71, 200: 18.79, 300: 14.56, 400: 25.27).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Group-Relative Policy Optimization (GRPO) further improves routing decisions over KL-ST alone.",
    390       "evidence": "Figure 5 and Table 2: KL-ST+GRPO (GHC 19.85) outperforms KL-ST-only (18.79) on Qwen3-8B. However, on smaller models (1.7B), GRPO provides NO improvement. Gain correlates with base model capability, not uniform.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "A lightweight two-layer Transformer router is sufficient for step-level precision routing without requiring a full-capacity LLM.",
    395       "evidence": "Section 4.1 and Figure 3 describe router architecture (2 layers, position embeddings, last-token pooling). No comparison to larger or smaller routers provided. No justification for architecture choice.",
    396       "supported": "weak"
    397     },
    398     {
    399       "claim": "The approach generalizes across LLM families and model scales (Qwen, DeepSeek; 1.7B–8B).",
    400       "evidence": "Table 1 shows results on Qwen3-8B/4B/1.7B and DeepSeek-R1-Distill-Llama3-8B. However, all evaluation is on ALFWorld only. No evaluation on other agentic benchmarks (WebArena, ScienceWorld, etc.).",
    401       "supported": "weak"
    402     },
    403     {
    404       "claim": "The router efficiently reduces high-precision calls without sacrificing task success (e.g., on Qwen3-8B, 88.8% success at 26.7% high-precision usage vs 89.6% success at 100% high-precision usage).",
    405       "evidence": "Table 1: Qwen3-8B router uses 26.7% high-precision and achieves 88.8% success vs 100% high-precision at 89.6% success. GHC=19.85 indicates efficient trade-off. Results limited to ALFWorld.",
    406       "supported": "strong"
    407     },
    408     {
    409       "claim": "Routing decisions based on behavioral divergence (KL) are more effective than random routing at equivalent cost budgets.",
    410       "evidence": "Table 1: Router@26.7% high-precision (GHC 19.85) outperforms Random@20% (GHC 2) and Random@40% (GHC 8.5) on Qwen3-8B. Similar pattern across all models.",
    411       "supported": "strong"
    412     }
    413   ],
    414   "methodology_tags": [
    415     "benchmark-eval",
    416     "empirical"
    417   ],
    418   "key_findings": "The paper demonstrates that step-wise precision sensitivity in quantized LLMs is heterogeneous—most steps tolerate quantization but critical decision points require full precision. A lightweight two-layer Transformer router trained via KL-divergence supervision and policy optimization can dynamically select between high- and low-precision models, achieving 26.7–88% high-precision usage while maintaining near-full-precision task success (88.8% vs 89.6%) on ALFWorld. The approach consistently outperforms random routing baselines, though improvements diminish on smaller models (1.7B), and has been validated only on a single benchmark environment.",
    419   "red_flags": [
    420     {
    421       "flag": "Single-benchmark evaluation",
    422       "detail": "All experiments on ALFWorld only. No evaluation on WebArena, ScienceWorld, or other agentic benchmarks limits claim generalization to 'real-world agentic tasks' (Section 1, page 1)."
    423     },
    424     {
    425       "flag": "No confidence intervals or repeated runs",
    426       "detail": "Table 1 reports point estimates with no error bars, std dev, or repeated evaluations. Single-run results; RL typically exhibits high variance, especially with sparse rewards (problem stated in Section 4.3)."
    427     },
    428     {
    429       "flag": "Missing external encoder specification",
    430       "detail": "Step embeddings pre-computed by 'external encoder' (Section 4.1, page 3) but encoder type, architecture, and training unknown. Critical preprocessing detail omitted, hindering reproducibility."
    431     },
    432     {
    433       "flag": "Ad-hoc hyperparameter selection",
    434       "detail": "KL threshold selected 'manually between 78th and 85th percentiles' (Appendix B, page 13). No ablation on threshold value. Selection method not principled."
    435     },
    436     {
    437       "flag": "Inconsistent GRPO gains by model scale",
    438       "detail": "GRPO improves GHC on Qwen3-4B (26.43→43.02) and 8B (18.79→19.85) but provides zero improvement on 1.7B (Table 2). Attribution to 'inherent router limitation' lacks depth; alternative explanations unexplored."
    439     },
    440     {
    441       "flag": "No comparison to learned routing baselines",
    442       "detail": "Section 2 cites RouteLLM, BEST-Route, RouterDC, and others, but experiments compare only to random routing and single-precision baselines. No head-to-head comparison to other routers."
    443     },
    444     {
    445       "flag": "Training data scale variance unexplained",
    446       "detail": "Table 3 shows non-monotonic GHC fluctuations across training-data scale (100 episodes: 14.71, 200: 18.79, 300: 14.56, 400: 25.27). This variance is not investigated; the paper does not explain why 300 episodes performs worse than both 200 and 400."
    447     },
    448     {
    449       "flag": "No limitations section or scope boundaries",
    450       "detail": "Paper frames its contribution broadly ('long-horizon agentic tasks', 'real-world') but evaluates only on ALFWorld and never states that scope explicitly. No discussion of what the approach does NOT work for (e.g., single-turn QA, retrieval, open-ended generation)."
    451     },
    452     {
    453       "flag": "Potential benchmark contamination not addressed",
    454       "detail": "ALFWorld released 2021; Qwen3 and DeepSeek-R1 likely trained 2024–2025. Possibility of test-set overlap in base model training not tested or discussed."
    455     },
    456     {
    457       "flag": "No code or training data release",
    458       "detail": "Reproduction impossible without code. The 200 high-precision rollouts for KL-ST training are not released. Evaluation uses public ALFWorld but training is unreproducible."
    459     }
    460   ],
    461   "cited_papers": [
    462     {
    463       "title": "Reflexion: Language agents with verbal reinforcement learning",
    464       "relevance": "Early work on agentic language model scaffolding and trajectory-level optimization; foundational for multi-step reasoning."
    465     },
    466     {
    467       "title": "ALFWorld: Aligning text and embodied environments for interactive learning",
    468       "relevance": "Primary evaluation benchmark; establishes the agentic task domain (household manipulation via natural language)."
    469     },
    470     {
    471       "title": "ReAct: Synergizing reasoning and acting in language models",
    472       "relevance": "Core agentic framework combining reasoning and tool use; baseline for step-level decision-making architectures."
    473     },
    474     {
    475       "title": "RouteLLM: Learning to route llms with preference data",
    476       "relevance": "Prior routing work at query level; this paper advances to step-level routing. Direct predecessor in routing literature."
    477     },
    478     {
    479       "title": "Can compressed llms truly act? an empirical evaluation of agentic capabilities in LLM compression",
    480       "relevance": "Directly relevant: quantization causes agentic failure on interactive tasks, motivating the precision-routing approach."
    481     },
    482     {
    483       "title": "BEST-Route: Adaptive LLM routing with test-time optimal compute",
    484       "relevance": "Concurrent work on compute-aware routing; establishes the importance of cost-benefit trade-offs in routing."
    485     },
    486     {
    487       "title": "Quantization hurts reasoning? an empirical study on quantized reasoning models",
    488       "relevance": "Shows quantization degrades performance on reasoning-heavy tasks; supports motivation for adaptive precision selection."
    489     },
    490     {
    491       "title": "SWE-bench: Can language models resolve real-world github issues?",
    492       "relevance": "Alternative agentic benchmark (software engineering); highlights generalization question beyond ALFWorld."
    493     },
    494     {
    495       "title": "WebArena: A realistic web environment for building autonomous agents",
    496       "relevance": "Another agentic benchmark domain (web navigation); would provide generalization evidence if evaluated here."
    497     }
    498   ],
    499   "engagement_factors": {
    500     "practical_relevance": {
    501       "score": 2,
    502       "justification": "Addresses real cost-efficiency problem for deployed agentic systems, but requires retraining router per model/benchmark pair and depends on GPTQ quantization availability. Limited immediate applicability."
    503     },
    504     "surprise_contrarian": {
    505       "score": 2,
    506       "justification": "Finding that quantized models fail at 'critical steps' is intuitive (confirms practitioner belief). Step-level routing is incremental advance over query-level. No surprising or contrarian insight."
    507     },
    508     "fear_safety": {
    509       "score": 0,
    510       "justification": "No discussion of safety, alignment, or AI risk. Paper is purely about efficiency optimization."
    511     },
    512     "drama_conflict": {
    513       "score": 0,
    514       "justification": "Straightforward technical contribution; no controversy, heated debates, or conflicting findings."
    515     },
    516     "demo_ability": {
    517       "score": 1,
    518       "justification": "No code release, so demo requires reimplementation. Figure 1 case study is illustrative but no runnable demo provided. Difficult to try immediately."
    519     },
    520     "brand_recognition": {
    521       "score": 1,
    522       "justification": "Authors from University of Arizona, Pittsburgh, UNC Chapel Hill, UCF—solid institutions but not top-tier ML labs (no Anthropic, OpenAI, DeepMind, Meta affiliations)."
    523     }
    524   },
    525   "hn_data": {
    526     "threads": [],
    527     "top_points": 0,
    528     "total_points": 0,
    529     "total_comments": 0
    530   }
    531 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs