scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (31393B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Dynamic Mix Precision Routing for Efficient Multi-step LLM Interaction",
      6     "authors": [
      7       "Yuanzhe Li",
      8       "Jianing Deng",
      9       "Jingtong Hu",
     10       "Tianlong Chen",
     11       "Song Wang",
     12       "Huanrui Yang"
     13     ],
     14     "year": 2026,
     15     "venue": "arXiv",
     16     "arxiv_id": "2602.02711",
     17     "doi": null
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract claims 'great improvement on accuracy–cost trade-off over single-precision baselines and heuristic routing methods.' Table 1 supports this: the router achieves the highest GHC across all model configurations.",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The ablation study (Table 2) uses controlled single-variable manipulation: KL-ST only, GRPO only, and KL-ST+GRPO. This is an adequate design for the causal claims about component contributions.",
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The title claims 'Multi-step LLM Interaction' generally, and Section 1 claims the framework is for 'long-horizon agentic tasks.' But experiments are only on ALFWorld — no evaluation on other agentic benchmarks (WebArena, ScienceWorld, SWE-bench) despite citing them in related work.",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No discussion of alternative explanations for the results. For example, whether the router merely learns to predict task difficulty rather than precision sensitivity, or whether simple heuristics (e.g., step position in trajectory) could explain the routing decisions.",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper directly measures what it claims: task success rate and high-precision usage ratio. The GHC metric is explicitly defined as combining these two. No proxy gap exists between measurement and claim.",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No dedicated limitations, threats to validity, or discussion section exists. The conclusion (Section 6) is 4 sentences with no discussion of limitations.",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No threats to validity are discussed anywhere in the paper.",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No explicit statements about what the results do NOT show. The paper does not acknowledge limitations of testing on only one benchmark or one task domain.",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding information or acknowledgments section is present in the paper.",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations are clearly listed: University of Arizona, University of Pittsburgh, University of North Carolina at Chapel Hill, University of Central Florida. All academic institutions.",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No funding is disclosed, so independence cannot be assessed. The paper is a multi-university collaboration where funding would typically exist.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests or financial disclosure statement is present in the paper.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Key terms defined: 'agentic tasks' (long-horizon decision making, tool use, environment interaction); 'mix-precision' (select between high/low precision LLMs); 'quantization' (post-training quantization methods); 'routing' (step-level model selection).",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Three contributions explicitly stated in introduction: (1) dynamic mix-precision framework, (2) RL-based policy optimization, (3) demonstration that lightweight router suffices. Reader clearly knows what is novel.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Related Work section (2) distinguishes step-level routing from prior query-level routing (RouteLLM, RouterDC, BEST-Route); explains why LLM quantization fails on agentic tasks; positions contribution as novel in fine-grained routing for sequential decision-making.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper.",
    126           "source": "opus"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "The paper uses ALFWorld, a publicly available benchmark environment (Shridhar et al., 2021). No proprietary data was collected.",
    132           "source": "opus"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "The paper specifies model names and quantization methods (GPTQ 3-bit/4-bit) but provides no requirements.txt, Dockerfile, or detailed environment setup with library versions.",
    138           "source": "opus"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No step-by-step reproduction instructions, README, or scripts for replicating experiments are provided.",
    144           "source": "opus"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "Table 1 and Table 2 report only point estimates for success rate, high-precision ratio, and GHC. No confidence intervals, error bars, or ± notation appear anywhere.",
    152           "source": "opus"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "The paper claims routing 'achieves a great improvement' and 'consistently achieves superior performance–cost trade-offs' but provides no statistical significance tests — all comparisons are based on raw number differences.",
    158           "source": "opus"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Table 1 provides success rates for all methods alongside their high-precision ratios, and the GHC metric quantifies improvement magnitude per unit of high-precision cost. For example, the router achieves 88.8% (vs 82.8% quantized-only baseline) with 26.7% HP ratio, giving sufficient context to understand effect magnitude.",
    164           "source": "opus"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "No justification for the number of evaluation episodes used. The number of ALFWorld test tasks is not stated explicitly, nor is any power analysis provided.",
    170           "source": "opus"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "No standard deviations, variance across seeds, or spread measures are reported. All results appear to be single-run point estimates.",
    176           "source": "opus"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Table 1 compares against full-precision BF16, quantized-only (GPTQ), and random routing at 20/40/60/80% high-precision ratios across four model configurations.",
    184           "source": "opus"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": false,
    189           "justification": "The baselines are only random routing at various ratios and single-precision inference. No comparison with existing routing methods from the related work (FrugalGPT, RouteLLM, HybridLLM, BEST-Route, Router-R1) despite extensive discussion of them in Section 2.",
    190           "source": "opus"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Table 2 ablates KL-ST vs GRPO-only vs KL-ST+GRPO. Table 3 ablates KL-ST training data scale (100/200/300/400 episodes). Both are systematic single-variable manipulations.",
    196           "source": "opus"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Three metrics are reported: success rate, high-precision usage ratio, and Gain per High-Precision Call (GHC). These capture effectiveness, cost, and their trade-off.",
    202           "source": "opus"
    203         },
    204         "human_evaluation": {
    205           "applies": false,
    206           "answer": false,
    207           "justification": "The paper evaluates routing in an automated simulator (ALFWorld) with binary task success. Human evaluation is not relevant to the claims.",
    208           "source": "opus"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "Section 5.1 states 'we evaluate on the unseen test task following Yao et al. (2022b),' using ALFWorld's standard unseen test split.",
    214           "source": "opus"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": false,
    219           "justification": "ALFWorld has 6 task types (pick, clean, heat, cool, examine, pick-two) but no per-task-type breakdown is provided. Results are only broken down by model configuration, not by task category.",
    220           "source": "opus"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Figure 1 and Appendix A.2 provide detailed case studies of trajectories where the low-precision model fails at critical steps while the router succeeds, with step-by-step traces.",
    226           "source": "opus"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Table 2 shows GRPO provides no improvement on Qwen3-1.7B (identical results to KL-ST). Section 5.3 discusses this: 'the effectiveness of routing-based optimization is bounded by the expressiveness and capability of the high-precision model itself.'",
    232           "source": "opus"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": false,
    239           "justification": "Models are specified as 'Qwen3-8B', 'Qwen3-4B', 'Qwen3-1.7B', and 'DeepSeek-R1-Distill-Llama3-8B' without specific checkpoint versions, HuggingFace model IDs, or snapshot dates. 'Post-training form released by the authors' is vague.",
    240           "source": "opus"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": false,
    245           "justification": "The paper uses ReAct-style prompting for ALFWorld but does not provide the actual prompt text. Only interaction traces (action/observation pairs) are shown in examples.",
    246           "source": "opus"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Appendix B reports training hyperparameters: learning rate 1e-4, batch size 64, 5 epochs for KL-ST; learning rate 1e-6, βKL=0.02, STRONG_COST=0.02 for GRPO. Router threshold between 78th-85th percentile of KL distribution.",
    252           "source": "opus"
    253         },
    254         "scaffolding_described": {
    255           "applies": true,
    256           "answer": true,
    257           "justification": "The routing framework is described in detail: Section 4.1 covers the 2-layer Transformer encoder architecture, step-level state representation, positional embeddings, and masked pooling. Section 3 formalizes the routing decision process. Figures 2 and 3 show the architecture.",
    258           "source": "opus"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Section 4.2 describes the trajectory sampling protocol: high-precision rollouts, retention of successful trajectories only, step-wise KL computation, CDF-based thresholding to binary labels, and class-specific weighting for imbalance.",
    264           "source": "opus"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "No raw data (trajectories, KL divergence values, routing decisions) is released for independent verification.",
    272           "source": "opus"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Section 4.2 describes the trajectory sampling protocol in detail: 200 episodes from high-precision rollouts, filtering for successful trajectories, computing action distributions from both precision levels at each step.",
    278           "source": "opus"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": false,
    282           "answer": false,
    283           "justification": "No human participants. The data source is ALFWorld, a standard public benchmark.",
    284           "source": "opus"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The pipeline is documented: high-precision rollouts → filter successful trajectories → compute step-wise KL → apply CDF threshold for binary labels → class-weighted training. GRPO pipeline (120 episodes, K rollouts per instance) is also described in Appendix B.",
    290           "source": "opus"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "No training data cutoff dates are stated for Qwen3 or DeepSeek-R1-Distill models.",
    298           "source": "opus"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "No discussion of whether the models' training data could include ALFWorld task descriptions, solutions, or related content.",
    304           "source": "opus"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "ALFWorld was published in 2021. All models used (Qwen3, DeepSeek-R1-Distill) were trained after 2021. No contamination analysis is provided.",
    310           "source": "opus"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No human participants in this study.",
    318           "source": "opus"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants in this study.",
    324           "source": "opus"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants in this study.",
    330           "source": "opus"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants in this study.",
    336           "source": "opus"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants in this study.",
    342           "source": "opus"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants in this study.",
    348           "source": "opus"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants in this study.",
    354           "source": "opus"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": true,
    361           "justification": "High-precision usage ratio is reported for all methods (Table 1), which directly measures the fraction of expensive inference calls. The GHC metric explicitly captures cost-effectiveness. However, no actual wall-clock time or dollar costs are provided.",
    362           "source": "opus"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "Training uses 200 episodes for KL-ST and 120 for GRPO (Appendix B), but no GPU hours, wall-clock time, or hardware specifications are reported.",
    368           "source": "opus"
    369         }
    370       },
    371       "experimental_rigor": {
    372         "seed_sensitivity_reported": {
    373           "applies": true,
    374           "answer": false,
    375           "justification": "No mention of multiple random seeds. All results appear to be single-seed runs.",
    376           "source": "opus"
    377         },
    378         "number_of_runs_stated": {
    379           "applies": true,
    380           "answer": false,
    381           "justification": "The number of evaluation runs per configuration is never stated. Results are presented as point estimates without clarifying how many runs produced them.",
    382           "source": "opus"
    383         },
    384         "hyperparameter_search_budget": {
    385           "applies": true,
    386           "answer": false,
    387           "justification": "The KL threshold τ is 'manually selected based on the empirical distribution' (Appendix B) between 78th-85th percentiles. No systematic search budget is reported.",
    388           "source": "opus"
    389         },
    390         "best_config_selection_justified": {
    391           "applies": true,
    392           "answer": true,
    393           "justification": "Appendix B states: 'Model selection is performed by choosing the checkpoint that achieves the highest validation accuracy across all training epochs.' Selection criterion and split are specified.",
    394           "source": "opus"
    395         },
    396         "multiple_comparison_correction": {
    397           "applies": false,
    398           "answer": false,
    399           "justification": "No statistical significance tests are performed, so multiple comparison correction does not apply.",
    400           "source": "opus"
    401         },
    402         "self_comparison_bias_addressed": {
    403           "applies": true,
    404           "answer": false,
    405           "justification": "The baselines (random routing) are all implemented by the authors. No discussion of self-evaluation bias or comparison with independently implemented baselines.",
    406           "source": "opus"
    407         },
    408         "compute_budget_vs_performance": {
    409           "applies": true,
    410           "answer": true,
    411           "justification": "Figure 5 and Table 1 plot success rate against high-precision usage ratio, which directly measures compute allocation. The GHC metric normalizes performance gain by compute cost.",
    412           "source": "opus"
    413         },
    414         "benchmark_construct_validity": {
    415           "applies": true,
    416           "answer": false,
    417           "justification": "Section 5.1 asserts ALFWorld is 'a representative benchmark for evaluating agentic ability' but does not critically assess whether ALFWorld's text-based household tasks actually measure the 'long-horizon decision making' capability claimed.",
    418           "source": "opus"
    419         },
    420         "scaffold_confound_addressed": {
    421           "applies": true,
    422           "answer": true,
    423           "justification": "All comparisons use the same ReAct-based interaction framework with ALFWorld. The only variable is precision routing — the scaffold is held constant across all conditions.",
    424           "source": "opus"
    425         }
    426       },
    427       "data_leakage": {
    428         "temporal_leakage_addressed": {
    429           "applies": true,
    430           "answer": false,
    431           "justification": "No discussion of whether the models' training data included ALFWorld-related content or solutions from after the benchmark's release.",
    432           "source": "opus"
    433         },
    434         "feature_leakage_addressed": {
    435           "applies": true,
    436           "answer": false,
    437           "justification": "No discussion of whether the evaluation setup leaks information through context that would not be available in genuine deployment.",
    438           "source": "opus"
    439         },
    440         "non_independence_addressed": {
    441           "applies": true,
    442           "answer": false,
    443           "justification": "No discussion of whether training episodes (used for router training) and test episodes share structural similarities or task templates.",
    444           "source": "opus"
    445         },
    446         "leakage_detection_method": {
    447           "applies": true,
    448           "answer": false,
    449           "justification": "No leakage detection or prevention method is applied.",
    450           "source": "opus"
    451         }
    452       }
    453     }
    454   },
    455   "claims": [
    456     {
    457       "claim": "Large language models achieve strong performance in long-horizon decision-making tasks through multi-step interaction.",
    458       "evidence": "BF16 full-precision Qwen3-8B achieves 89.6% success rate on ALFWorld unseen test tasks.",
    459       "supported": "strong"
    460     },
    461     {
    462       "claim": "Low-precision quantized LLMs suffer significant degradation on agentic tasks compared to full-precision.",
    463       "evidence": "GPTQ-3bit Qwen3-8B achieves 82.8% success vs 89.6% for BF16 (6.8pp gap); 3-bit DeepSeek achieves 44.0% vs 62.7% BF16 (18.7pp gap).",
    464       "supported": "strong"
    465     },
    466     {
    467       "claim": "Sensitivity to quantization is unevenly distributed across decision steps, forming a heavy-tailed distribution.",
    468       "evidence": "Figure 4 shows KL divergence distribution between low/high precision models is highly skewed with most steps near-zero divergence and rare tail of critical steps.",
    469       "supported": "strong"
    470     },
    471     {
    472       "claim": "A lightweight two-layer Transformer router trained with KL divergence supervision can effectively identify precision-critical steps.",
    473       "evidence": "KL-ST alone improves GHC from 2 (random@20%) to 18.79 (Qwen3-8B); demonstrates KL divergence is predictive of routing decisions.",
    474       "supported": "strong"
    475     },
    476     {
    477       "claim": "Group-Relative Policy Optimization further improves routing policy beyond supervised initialization.",
    478       "evidence": "Table 2 shows KL-ST+GRPO achieves GHC 43.02 vs KL-ST alone 26.43 on Qwen3-4B; Figure 5 shows GRPO improves frontier on both models tested.",
    479       "supported": "moderate"
    480     },
    481     {
    482       "claim": "The method achieves near-full-precision performance while using less than 30% high-precision model invocations.",
    483       "evidence": "Qwen3-8B router achieves 88.8% success (vs 89.6% BF16) using 26.7% high-precision calls; Qwen3-4B achieves 81.3% (vs 93.3% BF16) with 8.6% high-precision.",
    484       "supported": "moderate"
    485     },
    486     {
    487       "claim": "Step-level routing outperforms random routing at matched high-precision budgets.",
    488       "evidence": "Router@26.7% high-precision (GHC 19.85) outperforms Random@20% (GHC 2) and Random@40% (GHC 8.5) on Qwen3-8B.",
    489       "supported": "strong"
    490     },
    491     {
    492       "claim": "The effectiveness of routing-based optimization is bounded by the capability of the underlying high-precision model.",
    493       "evidence": "Table 2 shows GRPO provides no improvement on Qwen3-1.7B (GHC unchanged at 21.95) where high-precision baseline is weak (69.4% success), but significant improvements on Qwen3-4B (GHC 26.43→43.02) where baseline is stronger (93.3%).",
    494       "supported": "moderate"
    495     }
    496   ],
    497   "methodology_tags": [
    498     "empirical",
    499     "benchmark-eval",
    500     "ablation"
    501   ],
    502   "key_findings": "The paper demonstrates that LLM quantization degrades agentic task performance unevenly across decision steps, with a small number of 'critical steps' requiring high-precision inference. A lightweight step-level router trained via KL-divergence supervision and GRPO achieves near-full-precision performance with Qwen3-8B (88.8% vs 89.6%) using only 26.7% high-precision model invocations on ALFWorld. The method consistently outperforms random routing baselines across multiple model scales (Qwen3 8B/4B/1.7B, DeepSeek-R1-Distill-Llama3-8B). It operates at finer granularity than prior query-level routing approaches, but the paper reports no experimental comparison against them.",
    503   "red_flags": [
    504     {
    505       "flag": "Single benchmark evaluation",
    506       "detail": "All experiments on ALFWorld only. No evaluation on WebArena, SWE-Bench, or other agentic benchmarks. Generalization to other sequential decision-making tasks (web navigation, code execution, tool use) not demonstrated."
    507     },
    508     {
    509       "flag": "No statistical significance testing",
    510       "detail": "Results reported as point estimates with no confidence intervals, error bars, or significance tests. Unknown whether improvements are statistically significant or within noise margins."
    511     },
    512     {
    513       "flag": "Missing exact model versions",
    514       "detail": "Model names (Qwen3-8B, DeepSeek-R1-Distill-Llama3-8B) provided but no snapshot dates, commit hashes, or version numbers. Reproducibility compromised."
    515     },
    516     {
    517       "flag": "No code or artifact release",
    518       "detail": "No code, router training data (KL-ST trajectories), or trained routers are released, so the method cannot be readily reproduced, compared against, or extended by other researchers. Only the ALFWorld benchmark itself is publicly available."
    519     },
    520     {
    521       "flag": "Inconsistent GRPO benefits across model scales",
    522       "detail": "Table 2 shows GRPO improves GHC by 2.7-16.6pp on larger models but provides zero improvement on Qwen3-1.7B. Explanation ('router action space limitation') is vague; unclear when method will fail."
    523     },
    524     {
    525       "flag": "Missing baseline comparisons to prior routing methods",
    526       "detail": "Related Work discusses RouteLLM, RouterDC, BEST-Route but these are not included in experiments. Only random routing compared; cannot assess whether improvements are due to step-level routing or just the specific architecture."
    527     },
    528     {
    529       "flag": "Hyperparameter sensitivity unexplored",
    530       "detail": "CDF threshold τ chosen as 78th-85th percentiles but no sensitivity analysis. Learning rates manually selected. Robustness to hyperparameter choices unknown."
    531     },
    532     {
    533       "flag": "No failure mode analysis",
    534       "detail": "Case studies (Figure 1, Appendix A.2) only show trajectories where the low-precision model fails and the router succeeds. No systematic analysis of when the router itself fails, which tasks are hard for the router, or edge cases."
    535     },
    536     {
    537       "flag": "Custom metric (GHC) not validated",
    538       "detail": "GHC metric introduced without comparison to standard cost-effectiveness metrics from economics or engineering. Unclear if ranking by GHC aligns with practitioner preferences."
    539     },
    540     {
    541       "flag": "Limited negative results",
    542       "detail": "GRPO failure on smallest model mentioned briefly; fluctuations in Table 3 not analyzed. Otherwise only positive results reported, suggesting potential publication bias."
    543     }
    544   ],
    545   "cited_papers": [
    546     {
    547       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    548       "relevance": "Foundational work on agentic LLM interaction; discussed in Related Work as a point of comparison for step-level reasoning in long-horizon tasks."
    549     },
    550     {
    551       "title": "ALFWorld: Aligning Text and Embodied Environments for Interactive Learning",
    552       "relevance": "Primary evaluation benchmark; provides text-based embodied environment for testing agent decision-making under partial observability."
    553     },
    554     {
    555       "title": "RouteLLM: Learning to Route LLMs with Preference Data",
    556       "relevance": "Prior routing method for combining multiple LLMs; positioned as prior work that routes at the query level, motivating the novelty of step-level routing."
    557     },
    558     {
    559       "title": "Can Compressed LLMs Truly Act? An Empirical Evaluation of Agentic Capabilities in LLM Compression",
    560       "relevance": "Empirical study showing quantization degrades agentic performance; motivates the problem of maintaining performance under compression."
    561     },
    562     {
    563       "title": "AWQ: Activation-Aware Weight Quantization for On-Device LLM Compression",
    564       "relevance": "Quantization technique (GPTQ competitor) for creating low-precision model variants used in experiments."
    565     },
    566     {
    567       "title": "Voyager: An Open-Ended Embodied Agent with Large Language Models",
    568       "relevance": "Example agentic task requiring long-horizon planning and interaction; cited as representative of modern LLM agent applications."
    569     },
    570     {
    571       "title": "Toolformer: Language Models Can Teach Themselves to Use Tools",
    572       "relevance": "Demonstrates LLMs learning to use tools through interaction; relevant to scaffolding and agent design patterns."
    573     }
    574   ],
    575   "engagement_factors": {
    576     "practical_relevance": {
    577       "score": 2,
    578       "justification": "Potentially useful for practitioners reducing inference costs, but blocked by lack of code release and by evaluation limited to specific model families/quantization schemes."
    579     },
    580     "surprise_contrarian": {
    581       "score": 1,
    582       "justification": "Intuitive finding that some reasoning steps are harder than others; paper confirms rather than challenges conventional wisdom about step-level difficulty variation."
    583     },
    584     "fear_safety": {
    585       "score": 0,
    586       "justification": "No safety concerns raised or addressed; quantization and routing are standard efficiency techniques with no novel safety implications discussed."
    587     },
    588     "drama_conflict": {
    589       "score": 0,
    590       "justification": "Straightforward technical paper with no controversy, competing claims, or debate-worthy assertions."
    591     },
    592     "demo_ability": {
    593       "score": 1,
    594       "justification": "Requires setup of ALFWorld environment, specific quantized model variants, and router training pipeline; not immediately accessible or demo-able without code release."
    595     },
    596     "brand_recognition": {
    597       "score": 1,
    598       "justification": "Authors from solid academic institutions (Arizona, Pittsburgh, UNC, UCF) but not from major AI labs (OpenAI, Meta, DeepMind, Anthropic, Google); limited brand recognition outside research community."
    599     }
    600   },
    601   "hn_data": {
    602     "threads": [],
    603     "top_points": 0,
    604     "total_points": 0,
    605     "total_comments": 0
    606   }
    607 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs