scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28667B)
      1 {
      2   "paper": {
      3     "title": "HAPS: Hierarchical LLM Routing with Joint Architecture and Parameter Search",
      4     "authors": [
      5       "Zihang Tian",
      6       "Rui Li",
      7       "Jingsen Zhang",
      8       "Xiaohe Bo",
      9       "Wei Huo",
     10       "Xu Chen"
     11     ],
     12     "year": 2026,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2601.05903",
     15     "doi": "10.48550/arXiv.2601.05903"
     16   },
     17   "scan_version": 2,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "HAPS introduces hierarchical LLM routing that jointly searches over model architectures and parameters via a high-level classifier and low-level LoRA parameter generator with shared backbone. On HotpotQA and MMLU across three open-source model pairs, HAPS achieves state-of-the-art in 5/6 settings with F1 improvements of 1.6-3.6% over the best baseline. Parameter sharing between router levels is shown to be essential (up to 6pp accuracy drop without it), and the framework extends to mixed open/closed-source settings.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The abstract states: 'We have released our code at https://github.com/zihangtian/HAPS.' A concrete URL is provided."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The paper uses publicly available benchmarks: HotpotQA (Yang et al., 2018) and MMLU (Hendrycks et al., 2020). Both are standard public datasets."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed dependency listing is provided in the paper. Implementation details mention model names and hyperparameters but not software environment specifications."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions are included in the paper. The code repository URL is provided but the paper itself contains no README-style instructions for reproducing experiments."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Tables 2, 3, and 5 report only point estimates (e.g., 43.16 F1, 79 Acc). No confidence intervals, error bars, or ± notation appear anywhere in the results."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper claims 'HAPS consistently outperforms strong routing baselines' and reports absolute margins (1.85%, 3.60%), but no statistical significance tests (p-values, t-tests, bootstrap tests) are performed on any comparison."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 4.2 reports absolute improvement margins with baseline context: 'outperforms the runner-up by absolute margins of 1.85%, 3.60%, and 1.63% in F1 score on the L-Q, M-Q, and L-M pairs.' Table 2 provides full scores for all methods, making effect sizes computable."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "Appendix C states 3,000 train, 1,000 valid, and 100 test instances. No justification is given for why these sizes were chosen, nor is any power analysis discussed. The 100-instance test set is notably small for the margins claimed."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No standard deviations, variance across runs, or spread measures are reported. All results tables show single-run point estimates."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Section 4.1 lists four baselines: Random, RouteLLM (Ong et al., 2024), GraphRouter (Feng et al., 2024), and IRT-Router (Song et al., 2025). All are compared in Table 2."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All baselines are very recent: RouteLLM (ICLR 2024), GraphRouter (2024), and IRT-Router (2025). These represent the current state of the art in LLM routing."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Table 3 (Section 4.3) provides ablation studies removing the high-level router (random/fixed variants) and the low-level router. Figure 3 (Section 4.4) ablates parameter sharing. Figure 4 (Section 4.5) varies LoRA depth."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "The paper uses token-level F1 for HotpotQA and accuracy for MMLU (Section 4.1). Section 4.6 additionally evaluates performance, cost, and combined reward metrics."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No human evaluation is conducted. All evaluation relies on automated metrics (F1, accuracy). Human evaluation of routing decisions or output quality could have complemented automated metrics."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Appendix C explicitly states: 'The held-out Dtest (100 instances) is strictly reserved for final evaluation of the complete routing framework.' Separate train/valid/test splits are described."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down per candidate model pair (L-Q, M-Q, L-M) across both datasets in Tables 2, 3, and 5. The cost-performance trade-off is shown across three regimes. Figure 5 shows mixed-source pair breakdowns."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "No qualitative error analysis or examples of routing failures are provided. The paper does not examine cases where HAPS made incorrect routing decisions or where the generated LoRA parameters hurt performance."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Figure 4 (Section 4.5) shows that l3 LoRA depth degrades performance for L-Q and L-M pairs. Table 3 shows fixed assignment configurations that severely underperform. These are genuine negative results."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims HAPS 'consistently outperforms strong routing baselines.' Table 2 shows HAPS achieves the best result in 5/6 settings and ties in the 6th (L-Q MMLU at 79%), supporting this claim."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Causal claims are made through ablation studies: removing the high-level router, low-level router, and parameter sharing each degrade performance (Table 3, Figure 3). These controlled single-variable ablations adequately support the causal structure."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The abstract specifies 'two commonly used benchmarks.' The Limitations section explicitly states: 'the proposed router is evaluated under a specific set of tasks, model candidates, and cost settings; its effectiveness may vary when the task distribution, candidate pool, or budget constraints change.'"
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "No alternative explanations for the observed improvements are discussed. The paper does not consider whether gains could be due to overfitting on the small 100-instance test set, favorable benchmark characteristics, or other confounds beyond component ablation."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper claims routing effectiveness and measures it directly via F1 and accuracy on specific benchmarks. Claims match the granularity of measurements — no broader framing beyond benchmark performance is introduced."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Table 1 lists specific model versions: Llama-3.1-8B-Instruct, Qwen2.5-7B-Instruct, Mistral-7B-Instruct-v0.3, Llama-3.2-1B-Instruct (router). GPT-4.1 Nano and DeepSeek V3 are named with the latter citing its technical report."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Figures 6-14 in the appendix provide full prompt text for Teacher and Student agents on both HotpotQA and MMLU, plus system and user prompts for routing. The actual prompt templates with variable placeholders are provided with clear variable names."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "Training hyperparameters are partially reported: LoRA rank=8, scaling factor α=0.01, MLP 256 hidden units (Section 4.1). However, LLM inference parameters (temperature, top-p, sampling strategy) for the candidate models are not stated, which significantly affect output quality."
    160       },
    161       "scaffolding_described": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The Teacher-Student multi-agent scaffolding is described in detail. Appendix B provides full protocols for both HotpotQA (Algorithm 1: search/finish actions, continue/rethink feedback, 3-turn budget) and MMLU (Algorithm 2: K-round revision with submit action)."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Appendix C documents the full pipeline: downsampling to 3,000/1,000/100 splits, source splits for each benchmark (hotpot_dev_distractor_v1.json, stratified MMLU sampling), action-level dataset expansion via exhaustive enumeration, SFT and RL dataset construction."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "A dedicated 'Limitations' section appears after Section 6 (Conclusion), containing four substantive limitation points covering generalizability, reward signal noise, PEFT constraints, and API reproducibility."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The limitations are specific to this study: (1) effectiveness may vary with different task distributions and candidate pools, (2) reward signals can be noisy affecting training stability, (3) PEFT may cap performance compared to full-parameter adaptation, (4) proprietary API non-determinism limits reproducibility."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The Limitations section states: 'the proposed router is evaluated under a specific set of tasks, model candidates, and cost settings; its effectiveness may vary when the task distribution, candidate pool, or budget constraints change.' This names specific untested dimensions."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "While HotpotQA and MMLU are public benchmarks, the specific experimental data — downsampled splits, action-level datasets with exhaustive reward labels, and RL training subsets — are not explicitly released or linked in the paper."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Appendix C describes data collection in detail: sampling procedure from source splits, exhaustive enumeration of routing actions, reward computation for each action, and construction of SFT warm-up and RL datasets with specific sizes."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. Data comes from standard public benchmarks (HotpotQA, MMLU)."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Appendix C documents the pipeline: benchmark sampling → action-level expansion (Eq. 8, |Aact|=4 per problem) → SFT dataset via arg max reward selection → RL subset (300 train, 100 valid). Algorithm 3 describes the training pipeline."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding acknowledgment or grant information appears in the paper. One author is from Huawei Technologies, suggesting potential corporate support, but no funding source is disclosed."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are listed on the first page: Renmin University of China and Wireless Technology Lab, Huawei Technologies Co., Ltd. The Huawei affiliation is clearly stated."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No funding is disclosed despite a Huawei affiliation. The paper does not evaluate Huawei products directly, but without funding transparency, independence cannot be verified."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests statement or financial disclosure appears in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No training data cutoff dates are stated for any of the candidate models (Llama-3.1, Qwen2.5, Mistral, GPT-4.1 Nano, DeepSeek V3). These models are evaluated on benchmarks that predate their training."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No discussion of whether HotpotQA (2018) or MMLU (2020) examples appeared in the training data of models released in 2024-2025. These are widely-used benchmarks with high contamination risk."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "HotpotQA (2018) and MMLU (2020) were published years before the training cutoffs of all candidate models. Both benchmarks are extensively available online. No contamination analysis or discussion is provided."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study. The paper evaluates LLM routing on automated benchmarks."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants. The Ethical Considerations section confirms: 'we do not collect new data from human subjects.'"
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": true,
    291         "justification": "Section 4.6 and Table 5 report normalized inference cost across three trade-off regimes. Table 4 provides the cost configuration. Appendix E discusses inference efficiency via vectorized LoRA injection and request bucketing."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No total GPU hours, training time, or computational budget is reported. The two-phase training (SFT + RL) and exhaustive action enumeration require significant compute, but this is not quantified."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No results across multiple random seeds are reported. All tables show single-run point estimates without any seed sensitivity analysis."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The number of experimental runs is not stated. Results appear to be from single runs, but this is neither confirmed nor denied."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "Hyperparameters are described as 'empirically set' (Section 4.1: α=0.01, LoRA rank r=8), but no search budget, number of configurations tried, or search method is reported."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The paper does not explain how the final hyperparameters (α=0.01, rank=8, l2 depth) were selected. The LoRA depth study (Figure 4) shows all configurations but doesn't describe validation-based selection."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "No statistical tests are performed at all, making multiple comparison correction impossible. The paper makes numerous comparative claims across 6 settings × 5 methods without any correction."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "No discussion of author-evaluation bias. Appendix C notes that baseline 'architectures and training procedures' are unchanged, but the authors re-implement baselines with 'minimal modifications required by our two-agent routing setup' without acknowledging the bias this introduces."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "HAPS requires additional training (SFT + RL phases, exhaustive action enumeration, parameter generation networks) compared to baselines, but performance is not reported as a function of compute budget. The added compute cost of HAPS vs baselines is not compared."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "No discussion of whether HotpotQA F1 and MMLU accuracy adequately measure the routing capabilities being claimed. The paper assumes these benchmarks are valid proxies without questioning construct validity."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "The Teacher-Student scaffolding is held constant across all methods. Section 4.1 and Appendix C confirm all baselines use the same two-agent environment, isolating routing method as the variable."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "HotpotQA (2018) and MMLU (2020) were published years before all candidate models were trained. No discussion of whether models have seen benchmark solutions during training."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of whether the evaluation setup leaks information. The routing prompt includes task descriptions that could provide hints beyond what would be available in deployment."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of independence between the downsampled train/valid/test splits or whether the benchmark examples share structural similarities that could inflate results."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination is mentioned."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "HAPS achieves state-of-the-art performance in 5 out of 6 settings across HotpotQA and MMLU, outperforming the best baseline by 1.63-3.60% F1 on HotpotQA and 1% accuracy on MMLU.",
    372       "evidence": "Table 2 (Section 4.2) shows HAPS achieving highest scores in 5/6 settings: 43.16/39.70/40.58 F1 on HotpotQA and 79/78/74 accuracy on MMLU across L-Q, M-Q, L-M pairs.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "Both hierarchical components (high-level architecture router and low-level parameter generator) are indispensable for performance.",
    377       "evidence": "Table 3 (Section 4.3) shows removing high-level router drops F1 by up to 15% (random) or 30%+ (mismatched fixed assignments). Removing low-level router drops F1 by 4-9% and accuracy by 4-9%.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "Parameter sharing between high- and low-level routers is essential, with decoupling causing up to 4.33% F1 and 6% accuracy degradation.",
    382       "evidence": "Figure 3 (Section 4.4) compares HAPS with a decoupled variant across all settings, showing consistent drops: HotpotQA F1 drops of 1.64%, 3.75%, 4.33% and MMLU accuracy drops of 6%, 4%, 3%.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "HAPS effectively balances performance-cost trade-off, achieving the highest reward and lowest cost across all regimes.",
    387       "evidence": "Table 5 (Section 4.6) shows HAPS achieves highest reward in all three regimes (0.3178, 0.1426, 0.0148). In the Cost-First setting, HAPS maintains 41.81% F1 at 0.0861 cost while baselines drop to ~35% F1.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "HAPS extends to mixed open/closed-source LLM routing, outperforming the best baseline by 3.45% F1 (L-G) and 0.54% F1 (Q-D).",
    392       "evidence": "Figure 5 (Section 4.7) shows HAPS achieves 47.19% F1 on L-G (vs 43.74% RouteLLM) and 58.52% on Q-D (vs 57.98% RouteLLM) on HotpotQA.",
    393       "supported": "weak"
    394     }
    395   ],
    396   "red_flags": [
    397     {
    398       "flag": "Very small test set",
    399       "detail": "The test set contains only 100 instances per benchmark (Appendix C). Claimed improvements of 1-4% F1 could easily be within sampling noise at this sample size, yet no significance tests or confidence intervals are provided."
    400     },
    401     {
    402       "flag": "No error bars or statistical testing",
    403       "detail": "No results across the paper include variance, standard deviations, confidence intervals, or significance tests. All comparisons are based on single-run point estimates, making it impossible to assess whether differences are statistically meaningful."
    404     },
    405     {
    406       "flag": "Benchmark contamination risk ignored",
    407       "detail": "HotpotQA (2018) and MMLU (2020) are old, widely-used benchmarks. All candidate models (Llama-3.1, Qwen2.5, Mistral, etc.) were trained after these benchmarks were public. Contamination could differentially affect models, confounding routing comparisons."
    408     },
    409     {
    410       "flag": "Missing funding and conflict disclosure",
    411       "detail": "One author is from Huawei Technologies, but no funding source, competing interests, or financial disclosure is provided."
    412     },
    413     {
    414       "flag": "Baseline re-implementation with modifications",
    415       "detail": "Appendix C states baselines required 'minimal modifications' for the two-agent setup. Self-implemented baselines may systematically underperform original implementations (Lucic et al., 2018), and the authors do not acknowledge this bias."
    416     }
    417   ],
    418   "cited_papers": [
    419     {
    420       "title": "RouteLLM: Learning to Route LLMs from Preference Data",
    421       "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu", "Wei-Lin Chiang"],
    422       "year": 2024,
    423       "relevance": "Primary baseline for LLM routing using preference-based binary routing."
    424     },
    425     {
    426       "title": "GraphRouter: A Graph-Based Router for LLM Selections",
    427       "authors": ["Tao Feng", "Yanzhen Shen", "Jiaxuan You"],
    428       "year": 2024,
    429       "arxiv_id": "2410.03834",
    430       "relevance": "Graph-based LLM routing baseline using heterogeneous graph prediction."
    431     },
    432     {
    433       "title": "IRT-Router: Effective and Interpretable Multi-LLM Routing via Item Response Theory",
    434       "authors": ["Wei Song", "Zhenya Huang", "Cheng Cheng"],
    435       "year": 2025,
    436       "arxiv_id": "2506.01048",
    437       "relevance": "Interpretable routing baseline using item response theory for query difficulty modeling."
    438     },
    439     {
    440       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    441       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    442       "year": 2023,
    443       "arxiv_id": "2305.05176",
    444       "relevance": "Cascade-based LLM routing for cost-performance optimization."
    445     },
    446     {
    447       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    448       "authors": ["Edward J Hu", "Yelong Shen", "Phillip Wallis"],
    449       "year": 2022,
    450       "relevance": "Foundation for the parameter-efficient fine-tuning approach used in HAPS's low-level router."
    451     },
    452     {
    453       "title": "QLoRA: Efficient Finetuning of Quantized LLMs",
    454       "authors": ["Tim Dettmers", "Artidoro Pagnoni", "Ari Holtzman", "Luke Zettlemoyer"],
    455       "year": 2023,
    456       "relevance": "Quantized efficient fine-tuning approach related to parameter-efficient LLM adaptation."
    457     },
    458     {
    459       "title": "Language Models are Few-Shot Learners",
    460       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    461       "year": 2020,
    462       "relevance": "Foundational GPT-3 paper establishing LLM capability landscape that motivates routing."
    463     },
    464     {
    465       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    466       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"],
    467       "year": 2022,
    468       "relevance": "Reasoning-acting paradigm that inspired the Teacher-Student multi-agent framework used in HAPS."
    469     },
    470     {
    471       "title": "RouterBench: A Benchmark for Multi-LLM Routing System",
    472       "authors": ["Qitian Jason Hu", "Jacob Bieker", "Xiuyu Li"],
    473       "year": 2024,
    474       "arxiv_id": "2403.12031",
    475       "relevance": "Benchmarking framework for evaluating LLM routing systems."
    476     },
    477     {
    478       "title": "DeepSeek-V3 Technical Report",
    479       "authors": ["Aixin Liu", "Bei Feng", "Bing Xue"],
    480       "year": 2024,
    481       "arxiv_id": "2412.19437",
    482       "relevance": "Technical report for DeepSeek V3, one of the candidate LLMs used in mixed-source experiments."
    483     },
    484     {
    485       "title": "Reflective Multi-Agent Collaboration Based on Large Language Models",
    486       "authors": ["Xiaohe Bo", "Zeyu Zhang", "Quanyu Dai"],
    487       "year": 2024,
    488       "relevance": "Multi-agent LLM collaboration framework that informed the Teacher-Student evaluation setup."
    489     },
    490     {
    491       "title": "TensorOpera Router: A Multi-Model Router for Efficient LLM Inference",
    492       "authors": ["Dimitris Stripelis", "Zijian Hu", "Jipeng Zhang"],
    493       "year": 2024,
    494       "arxiv_id": "2408.12320",
    495       "relevance": "Multi-model routing for efficient LLM serving, related work in the routing space."
    496     },
    497     {
    498       "title": "HyperLoRA: Efficient Cross-Task Generalization via Constrained Low-Rank Adapters Generation",
    499       "authors": ["Chuancheng Lv", "Lei Li", "Shitou Zhang"],
    500       "year": 2024,
    501       "relevance": "Hypernetwork-based LoRA generation for cross-task transfer, directly related to HAPS's parameter generation approach."
    502     }
    503   ]
    504 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs