scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (23930B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "HAPS: Hierarchical LLM Routing with Joint Architecture and Parameter Search",
      6     "authors": [
      7       "Zihang Tian",
      8       "Rui Li",
      9       "Jingsen Zhang",
     10       "Xiaohe Bo",
     11       "Wei Huo"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2601.05903",
     16     "doi": "10.48550/arXiv.2601.05903"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims HAPS 'consistently outperforms strong routing baselines' on two benchmarks; Table 2 confirms best performance in 5 of 6 open-source pair settings, and code is released at the stated GitHub URL.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Ablation studies (Table 3, Figure 3, Figure 4) systematically isolate the high-level router, low-level router, and parameter sharing, providing adequate within-system causal evidence for each component's contribution.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The conclusion calls HAPS 'a promising and practical direction for building efficient, adaptive AI systems' based on 100 test instances across two benchmarks and pairwise model configurations; the scope of the claim exceeds what these narrow evaluations can support.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not consider whether improvements stem from LoRA fine-tuning on the training distribution rather than routing per se, nor whether the Teacher-Student scaffold itself drives gains independent of routing.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Token-level F1 for HotpotQA and exact-match accuracy for MMLU are used as direct measures of the claimed 'task performance'; the paper does not conflate these with broader capability claims.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "A dedicated 'Limitations' section is present at the end of the main paper listing four distinct limitations.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Three of the four stated limitations are generic ('effectiveness may vary when task distribution changes'; 'reward signals can be noisy'; 'reproducibility may be limited'); none name specific confounds such as the 100-instance test set size or distribution shift from exhaustive enumeration training.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The limitations note the method was evaluated on 'a specific set of tasks, model candidates, and cost settings' but do not explicitly bound what the results do NOT show (e.g., no claims about multi-hop reasoning beyond 2-agent pairs or non-pairwise routing pools).",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment or grant number is present anywhere in the paper; only institutional affiliations are listed.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly stated: four authors at Renmin University of China and Wei Huo at Huawei Technologies Wireless Technology Lab.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "One author (Wei Huo) is from Huawei Technologies, an organization with commercial interest in efficient LLM deployment; no independence statement or COI disclosure is made.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "There is no competing interests or financial disclosure statement anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "LLM routing is formally defined in Section 2 with a mathematical objective; the distinction between 'architecture selection' (discrete) and 'parameter search' (continuous) is clearly explained with a motivating example in Figure 1.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly states three contributions: the idea of joint architecture+parameter search, the hierarchical implementation with parameter generation network, and the experimental validation.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 5 engages with RouteLLM, GraphRouter, IRT-Router, cascade systems, and PEFT literature, explaining how HAPS bridges discrete routing and continuous adaptation in a way prior work does not.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The abstract states 'We have released our code at https://github.com/zihangtian/HAPS', indicating an actual public release.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Both evaluation datasets (HotpotQA and MMLU) are standard public benchmarks used unmodified.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements.txt, Dockerfile, or dependency specification is provided; the paper describes model architectures but not the software environment needed to run them.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": true,
    142           "justification": "Appendices B–D provide detailed Algorithms 1–3, full agent prompts (Figures 6–14), data split construction, and baseline adaptation procedures sufficient to reproduce the pipeline alongside the released code.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Tables 2, 3, and 5 and Figures 3–5 are single point estimates with no confidence intervals, standard deviations, or error bars across any runs.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are reported for any comparisons; margins as small as 1 F1 point on a 100-instance test set are described as 'consistent' improvements without testing.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Absolute improvement margins are reported throughout (e.g., '1.85%, 3.60%, and 1.63% in F1 score' over the runner-up), providing effect size context.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The 100-instance test set is stated in Appendix C without justification or power analysis; no rationale is given for why this size is sufficient for reliable comparisons.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or results across multiple runs are reported; it is unclear whether experiments were run once or averaged.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Four baselines are compared: Random, RouteLLM, GraphRouter, and IRT-Router, covering diverse routing paradigms.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "RouteLLM (2024), GraphRouter (2024), and IRT-Router (2025) are recent and represent the state of the art in the LLM routing literature.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Comprehensive ablation covers removal of the high-level router (random or fixed variants), removal of the low-level router, removal of parameter sharing, and variation in LoRA injection depth.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Token-level F1 (HotpotQA) and accuracy (MMLU) are used for quality; normalized cost and a composite reward are used in the trade-off analysis (Section 4.6).",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "Automated benchmarks with objective labels (HotpotQA F1, MMLU accuracy) make human evaluation of outputs not applicable here.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Appendix C explicitly states Dtest (100 instances) 'is strictly reserved for final evaluation of the complete routing framework' and is not used during training or validation.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": false,
    218           "justification": "MMLU covers 57 subjects but no per-subject accuracy breakdown is provided; HotpotQA results are also reported only as aggregate F1.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "No failure cases or error analysis are presented; the paper only discusses where HAPS performs best, not where it fails.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Figure 4 shows LoRA depth l3 degrades performance for L-Q and L-M pairs relative to l2, reporting the non-monotonic relationship as a finding rather than hiding it.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Specific versioned model names are given: Llama-3.1-8B-Instruct, Qwen2.5-7B-Instruct, Mistral-7B-Instruct-v0.3, Llama-3.2-1B-Instruct, GPT-4.1 Nano, DeepSeek V3.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Full verbatim prompts for all agent roles on both benchmarks are provided in Appendix Figures 6–14, including system prompts and all template fields.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "Only scaling factor α=0.01 and LoRA rank r=8 are reported; learning rates ηsft, ηθ, ηϕ appear as symbols in Algorithm 3 but their values are never given, and training epochs/batch sizes are also absent.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The two-agent Teacher-Student framework, interaction protocols, action spaces, and context management for both HotpotQA and MMLU are described in detail across Appendices A and B with formal algorithms.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Appendix C documents stratified sampling over MMLU subjects, sampling from hotpot_dev_distractor_v1.json for HotpotQA, and the exhaustive enumeration procedure for constructing the action-level training dataset.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The preprocessed train/val/test splits (3000/1000/100 instances) and the exhaustively-enumerated action-reward dataset are not separately published; availability depends on the code repository.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Appendix C describes dataset sourcing, split sizes, stratified sampling for MMLU, and the procedure for constructing the action-level dataset via exhaustive enumeration of routing decisions.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; standard public benchmarks are used.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The full pipeline from raw benchmark sampling through SFT warm-up dataset construction, RL dataset construction, and test set isolation is documented in Appendix C and Algorithm 3.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "The paper evaluates LLMs on MMLU (2020) and HotpotQA (2018), both of which predate, and are likely included in, the training corpora of Llama-3.1, Qwen2.5, and Mistral; no training cutoff is stated for any model.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "Potential contamination of MMLU or HotpotQA examples in the base LLMs' pretraining data is not discussed anywhere in the paper.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "Neither benchmark was created after the models' training cutoffs; MMLU (2020) and HotpotQA (2018) are well-known benchmarks highly likely to appear in pretraining data, but this is not acknowledged.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants; the ethics section notes no PII is collected.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Section 4.6 provides an explicit performance-cost trade-off analysis with normalized costs, pricing tables (Table 4), and cost values for all methods under three budget regimes.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No GPU hours, training time, or total compute budget for training HAPS is reported anywhere in the paper.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "HAPS achieves state-of-the-art performance in 5 of 6 open-source pair settings on HotpotQA and MMLU",
    375       "evidence": "Table 2 shows HAPS achieves best F1/accuracy in L-Q, M-Q, L-M HotpotQA and M-Q, L-M MMLU; ties on L-Q MMLU at 79%",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Parameter sharing between high-level and low-level routers is necessary, contributing up to 4.33% F1 gain",
    380       "evidence": "Figure 3 shows decoupled variant (w/o Parameter Sharing) drops HotpotQA F1 by 1.64–4.33% and MMLU accuracy by 3–6% across all pairs",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Both high-level architecture routing and low-level parameter generation are individually necessary",
    385       "evidence": "Table 3 ablations show removing the high-level router (random) drops F1 by up to 15%, and removing the low-level router degrades performance across all settings",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "HAPS achieves better performance-cost trade-offs than baselines, reducing cost by 58% vs RouteLLM while maintaining comparable F1",
    390       "evidence": "Table 5 shows HAPS achieves cost 0.0886 vs RouteLLM's 0.2111 in performance-first regime while F1 is 41.94% vs 43.35%",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "HAPS extends naturally to mixed open/closed-source settings by blocking gradient propagation for proprietary APIs",
    395       "evidence": "Figure 5 shows HAPS outperforms all baselines on L-G (47.19% F1) and Q-D (58.52% F1) mixed-source pairs on HotpotQA",
    396       "supported": "moderate"
    397     }
    398   ],
    399   "methodology_tags": [
    400     "benchmark-eval"
    401   ],
    402   "key_findings": "HAPS proposes a hierarchical LLM routing framework that jointly searches discrete model architectures and continuous LoRA parameters, outperforming four routing baselines on HotpotQA (F1) and MMLU (accuracy) across multiple model-pair configurations. Parameter sharing between the high-level routing classifier and low-level parameter generation network is a critical design choice: removing it costs 1.64–4.33% F1 on HotpotQA. The framework also achieves favourable performance-cost trade-offs by generating concise routing and reducing token consumption, and extends to mixed open/closed-source settings by blocking gradient flow for proprietary APIs. All improvements are measured on 100-instance test sets without statistical significance testing, limiting confidence in the reported margins.",
    403   "red_flags": [
    404     {
    405       "flag": "Tiny test set",
    406       "detail": "Test sets contain only 100 instances per benchmark; margins of 1–3 F1 points are reported as 'consistent' improvements without any statistical significance testing or error bars."
    407     },
    408     {
    409       "flag": "No variance reported",
    410       "detail": "All results are single point estimates; no standard deviation, confidence intervals, or multiple-run variance is reported anywhere in the paper."
    411     },
    412     {
    413       "flag": "Benchmark contamination unaddressed",
    414       "detail": "MMLU (2020) and HotpotQA (2018) predate all base models used; the possibility that benchmark examples appear in pretraining data is not discussed."
    415     },
    416     {
    417       "flag": "Missing hyperparameters",
    418       "detail": "Learning rates ηsft, ηθ, ηϕ appear as symbols in Algorithm 3 but their actual values are never reported; training epochs and batch sizes are also absent."
    419     },
    420     {
    421       "flag": "No per-category breakdown",
    422       "detail": "MMLU covers 57 subjects but only aggregate accuracy is reported; subgroup variation cannot be assessed."
    423     },
    424     {
    425       "flag": "Huawei affiliation without competing-interests disclosure",
    426       "detail": "One author is from Huawei Technologies, a commercial AI stakeholder, but no funding disclosure or competing interests statement is made."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "RouteLLM: Learning to Route LLMs from Preference Data",
    432       "relevance": "Primary routing baseline and inspiration for preference-based binary routing"
    433     },
    434     {
    435       "title": "GraphRouter: A Graph-based Router for LLM Selections",
    436       "relevance": "Heterogeneous graph routing baseline"
    437     },
    438     {
    439       "title": "IRT-Router: Effective and Interpretable Multi-LLM Routing via Item Response Theory",
    440       "relevance": "Contemporary interpretable routing baseline with difficulty modeling"
    441     },
    442     {
    443       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    444       "relevance": "Foundation for the parameter generation component of HAPS"
    445     },
    446     {
    447       "title": "HotpotQA: A Dataset for Diverse, Explainable Multi-Hop Question Answering",
    448       "relevance": "Primary multi-hop reasoning evaluation benchmark"
    449     },
    450     {
    451       "title": "Measuring Massive Multitask Language Understanding (MMLU)",
    452       "relevance": "Primary broad-knowledge evaluation benchmark"
    453     },
    454     {
    455       "title": "HyperNetworks",
    456       "relevance": "Architectural basis for the parameter generation network design"
    457     },
    458     {
    459       "title": "RouterBench: A Benchmark for Multi-LLM Routing System",
    460       "relevance": "Related work on routing evaluation infrastructure"
    461     }
    462   ],
    463   "engagement_factors": {
    464     "practical_relevance": {
    465       "score": 2,
    466       "justification": "LLM routing for cost-performance trade-offs is directly applicable to production deployments, and code is released."
    467     },
    468     "surprise_contrarian": {
    469       "score": 1,
    470       "justification": "The joint architecture+parameter framing is a modest but not surprising extension of existing routing work."
    471     },
    472     "fear_safety": {
    473       "score": 0,
    474       "justification": "No safety or risk concerns are raised; the paper is purely a performance optimization contribution."
    475     },
    476     "drama_conflict": {
    477       "score": 0,
    478       "justification": "No controversy or conflicting claims with established work."
    479     },
    480     "demo_ability": {
    481       "score": 2,
    482       "justification": "Code is publicly released on GitHub; practitioners could in principle reproduce the routing setup on public benchmarks."
    483     },
    484     "brand_recognition": {
    485       "score": 1,
    486       "justification": "Huawei affiliation is notable but the paper is not from a marquee AI lab; venue is arXiv preprint."
    487     }
    488   },
    489   "hn_data": {
    490     "threads": [],
    491     "top_points": 0,
    492     "total_points": 0,
    493     "total_comments": 0
    494   }
    495 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs