scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28632B)
      1 {
      2   "paper": {
      3     "title": "Effective LoRA Adapter Routing using Task Representations",
      4     "authors": [
      5       "Akash Dhasade",
      6       "Anne-Marie Kermarrec",
      7       "Igor Pavlovic",
      8       "Diana Petrescu",
      9       "Rafael Pires",
     10       "Mathis Randl",
     11       "Martijn de Vos"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv preprint",
     15     "arxiv_id": "2601.21795",
     16     "doi": "10.48550/arXiv.2601.21795"
     17   },
     18   "scan_version": 2,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "LORAUTER is a training-free framework that routes LoRA adapters via task representations rather than adapter characteristics, achieving 101.2% of oracle performance in-domain and outperforming the best baseline by 5.2 percentage points out-of-domain (OOD) on LLaMA2-7B. The framework scales to 1500+ HuggingFace adapters while maintaining competitive OOD performance (85.7% vs 88.4% with the curated 48-adapter pool). Successive Halving reduces adapter evaluation cost by over 2x compared to uniform selection. Ablations show both the task-level retrieval and weighted composition components independently contribute to performance gains.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No code repository URL, GitHub link, or archive is provided anywhere in the paper."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The paper uses publicly available FLANV2 datasets (Wei et al., 2022) and publicly available HuggingFace LoRA adapters. The specific adapter pool URL is provided (huggingface.co/models?other=base_model:adapter:meta-llama/Llama-2-7b-hf), and the sentence encoder URL is given (huggingface.co/Styxxxx/lora_retriever)."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions bfloat16 precision and model names (LLaMA2-7B/13B) but provides no requirements.txt, Dockerfile, or detailed dependency listing sufficient to recreate the environment."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions, README, or runnable scripts are described."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "Main routing results (Figures 2-3, Tables 2-6) report only point estimates of normalized performance with no confidence intervals or error bars. Figure 10 shows standard deviation for SH over 100 runs, but this is for a supplementary analysis, not the core claims."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No statistical significance tests are used. Claims like 'LORAUTER outperforms LORARETRIEVER by +5.2 points' are based solely on comparing point estimates without any test (p-values, bootstrap, etc.)."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "The paper reports percentage-point improvements with baseline context: '+5.2 percentage points' over LORARETRIEVER in OOD (88.4% vs 83.2%), '101.2% of oracle' in non-OOD, and '>2x reduction' in adapter evaluations via SH. Absolute and relative figures are consistently provided."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "50 test samples per task and at most 200 validation samples per task are used without any justification for why these sizes are sufficient. No power analysis is provided."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "Main routing results (Tables 2, 3, 6) are single-point estimates with no variance or standard deviation across runs. The task representation construction involves random sampling of validation queries (Section 3.3), but the sensitivity of main results to this randomness is not assessed. Only the SH analysis (Figure 10) reports std dev over 100 runs."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Four baselines are compared: LORAHUB, LORARETRIEVER, ARROW, and SpectR, plus an Oracle task-aligned adapter (Section 4.1). All are evaluated under identical conditions."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The evaluated baselines are recent: LORAHUB (COLM24), LORARETRIEVER (FindingsACL24), ARROW (ICML24), and SpectR (2025). MOLE (ICLR24), a related trained-router method, is similarly recent. These represent the current state of the art in LoRA adapter routing."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Section 4.3 presents two ablation studies: (1) swapping retrieval and composition components between LORARETRIEVER and LORAUTER (Table 2), and (2) K=1 vs K=3 to show composition benefit (Table 3). Section 4.4 studies pseudo-task clusters. Table 10 ablates K values from 1-5."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Multiple task-specific metrics are used: Exact Match (EM) for NLU tasks, BLEU for translation, and ROUGE-1/2/L for structure-to-text generation (Section 4.1). Results are also reported as normalized averages."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "All evaluation is fully automated using EM, BLEU, and ROUGE metrics. No human evaluation of output quality is conducted."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 4.1 states: 'For the final evaluation, we use the same test set as Zhao et al. (2024), consisting of 50 held-out test samples for each of the 48 tasks.' Validation sets are explicitly separated and used only for routing/adapter selection."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Tables 6, 11, 12 provide per-domain breakdowns (struct-to-text, commonsense, sentiment, etc.) and Tables 11-18 in the appendix provide full per-task results for all 48 tasks across all methods and settings."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "While aggregate performance degradation in OOD settings is noted, no qualitative failure analysis is provided. The paper does not examine specific queries or tasks where LORAUTER fails or produces poor outputs, or explain why particular routing decisions go wrong."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Several negative results are reported: K=1 (no composition) significantly underperforms K=3 (Table 3); selection-based methods collapse in OOD (Section B.1, Table 4); too few or too many pseudo-task clusters hurt performance (Figure 4); and baselines like LORAHUB and SpectR substantially underperform."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Abstract claims are supported: 'matching Oracle performance (101.2%)' is in Figure 2 and Table 6; '+5.2 points' OOD improvement over LORARETRIEVER matches Figure 2; scaling to 1500+ adapters is in Section 4.2 and Tables 7-8."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The paper makes causal claims through ablation studies in Section 4.3 (Table 2): swapping individual components (retrieval, composition) demonstrates their individual causal contributions. These are controlled single-variable manipulations adequate for the causal claims made."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper tests only on LLaMA2 (7B and 13B) and FLANV2 tasks but makes broad claims: 'a practical foundation for leveraging large, dynamic adapter pools for LoRA routing' (Section 6). The title 'Effective LoRA Adapter Routing' implies general applicability beyond the tested model family and task set."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "No alternative explanations for the results are discussed. The paper does not consider confounds such as whether gains come from the specific task taxonomy used, the quality of the sentence encoder, or dataset-specific properties of FLANV2."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper's claims match the granularity of its measurements. It claims routing effectiveness and measures it via task-specific metrics (EM, BLEU, ROUGE) normalized by oracle performance. No broader framing gap exists between what is measured and what is claimed."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Specific model identifiers are provided: 'LLaMA 2-7B,13B (Touvron et al., 2023b)' with reference to the specific paper. The sentence encoder is identified by HuggingFace URL (huggingface.co/Styxxxx/lora_retriever)."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "The embedding instruction is provided ('Represent the sentence for similar task retrieval', Section 3.3), but the actual task evaluation prompts used for each of the 48 FLANV2 tasks are not reproduced. The paper references Alpaca instruction format and FLANV2 verbalization but does not include the prompt text."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Key hyperparameters are reported in Section 4.1 and Appendix A: K=3 for top-K fusion, τ=0.2 softmax temperature, LoRA rank r=6, scaling α=12, bfloat16 precision, and at most 200 validation samples per task."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. LORAUTER is a routing and composition framework, not an agentic system."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 4.1 and Appendix A.2 describe the dataset setup: 48 tasks from FLANV2, adapter training with Alpaca format, validation/test splits, and HuggingFace adapter collection criteria (rank ≤64 for LLaMA2-7B, yielding 1567 adapters)."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No dedicated Limitations section exists. The Impact Statement discusses potential biases and misuse risks but does not address methodological limitations of the study itself."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No specific threats to validity are discussed. The Impact Statement mentions 'potential biases, privacy risks, or harmful behaviors' inherited from base models but these are generic deployment concerns, not study-specific methodological threats."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "No explicit statements about what the results do NOT show. The paper does not acknowledge that results are limited to LLaMA2, FLANV2 tasks, or the specific adapter training configuration."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "While the input datasets (FLANV2) and adapters are publicly available, the experimental outputs (per-query predictions, adapter selections, routing scores) are not released for independent verification."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 4.1 describes adapter training (48 adapters, one per FLANV2 task, with r=6, α=12, Alpaca format). Section 4.2 describes HuggingFace adapter collection (all LLaMA2-7B adapters with rank ≤64). Appendix A.2 details the dataset taxonomy and formatting."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. Data sources are standard public benchmarks (FLANV2) and public adapter repositories (HuggingFace)."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The pipeline from task selection → adapter training → validation set construction → task representation building → evaluation is documented across Sections 3 and 4. Appendix A describes evaluation details including instruction formatting, data splits, and metric computation."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section listing grants or sponsors."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "All authors are listed as affiliated with EPFL, Lausanne, Switzerland. They are not evaluating a commercial product from their institution."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "Cannot assess funder independence since no funding is disclosed. The work is academic (EPFL) and does not evaluate a commercial product, but without funding disclosure, independence cannot be verified."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is present in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "LLaMA2's training data cutoff is not stated. The model was released in 2023 and could have been trained on data containing FLANV2 test examples (published 2022), but this is not discussed."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No discussion of whether FLANV2 test examples could have appeared in LLaMA2's pre-training data. While LoRA adapters are fine-tuned on training splits, the base model's exposure to test data is not addressed."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "FLANV2 was published in 2022 and LLaMA2 was trained on data collected through 2023, meaning benchmark examples were available online before training. This contamination risk is not discussed."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "Computational complexity is analyzed theoretically (O(T) vs O(N) vs O(NL) in Table 1) and evaluation budgets are compared for SH (Figure 5), but no actual wall-clock time, GPU hours, or monetary costs are reported for inference or routing."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "No total computational budget is stated. The paper does not report GPU hours, total training time for the 48 adapters, or compute used for evaluation. Hardware specifications are also absent."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Main routing results are not reported across multiple random seeds. The SH analysis (Section 4.5) uses 100 runs with random validation subsets, but the core performance claims (Tables 2, 3, 6) appear to be from a single configuration without seed sensitivity analysis."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "For SH analysis, '100 independent runs' is stated. For the main routing results, the number of runs is not stated. Task representation construction involves random sampling of validation queries, but whether main results represent a single draw or average over multiple is not specified."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "While Table 10 (Appendix B.6) ablates K values 1-5, no overall hyperparameter search budget is reported. The softmax temperature τ=0.2 appears fixed without justification of how it was selected."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "K=3 is selected based on the ablation in Table 10, which shows K=2 and K=3 perform nearly identically (101.8% vs 101.6%), with K=3 chosen for consistency with baselines. The paper transparently states this rationale."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors evaluate their own system (LORAUTER) against baselines they re-implemented (ARROW, SpectR) and against results copied from original papers (LORARETRIEVER). They do not acknowledge or discuss the systematic bias of authors evaluating their own system, as documented by Lucic et al. (2018)."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": true,
    334         "justification": "Figure 5 explicitly plots normalized performance as a function of total evaluations (compute budget) for SH vs uniform selection. Table 1 compares adapter selection overhead complexity (O(T) vs O(N) vs O(NL)) across all methods."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The paper uses FLANV2 with task-specific metrics and normalized averages but does not discuss whether this benchmark setup actually measures routing effectiveness, or whether normalized average is a valid aggregate measure of adapter routing quality."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "No scaffolding is involved in the evaluation. LORAUTER is a routing framework, not an agentic scaffold."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "FLANV2 tasks were published in 2022 and LLaMA2 was trained on 2023 data. The model could have seen benchmark examples during pre-training. This temporal leakage is not discussed."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether the evaluation setup leaks information. For instance, the text encoder used for routing could capture features that implicitly reveal the task identity beyond what a realistic user query would contain."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of whether train and test examples within FLANV2 tasks share structural similarities or whether adapters trained on related tasks create dependencies."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No concrete leakage detection or prevention method is applied (no canary strings, membership inference, n-gram overlap analysis, or decontamination)."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "LORAUTER achieves 101.2% of oracle task-aligned adapter performance in-domain (non-OOD) on LLaMA2-7B",
    373       "evidence": "Figure 2 and Table 6 show normalized average of 101.2% for LORAUTER vs 100% Oracle and 92.9% for LORARETRIEVER on LLaMA2-7B non-OOD setting.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "LORAUTER outperforms the best baseline by 5.2 percentage points in the OOD setting on LLaMA2-7B",
    378       "evidence": "Figure 2 shows 88.4% for LORAUTER vs 83.2% for LORARETRIEVER (best baseline) in OOD LLaMA2-7B. Per-task breakdown in Table 6 confirms consistent gains across domains.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "LORAUTER scales to 1500+ adapters from HuggingFace while maintaining competitive performance",
    383       "evidence": "Section 4.2 reports 85.7% OOD performance with 1567 HuggingFace adapters vs 88.4% with curated 48-adapter pool (Table 8). Only a 2.7pp drop despite 33x more adapters from uncontrolled sources.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Both retrieval and composition components individually contribute to LORAUTER's performance",
    388       "evidence": "Table 2 ablation: replacing only LORARETRIEVER's composition with LORAUTER's improves from 92.9% to 98.6%; replacing only retrieval improves to 96.7%. Both components independently add value.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Successive Halving reduces evaluation budget by more than 2x compared to uniform selection",
    393       "evidence": "Figure 5 shows SH reaches near-peak performance at approximately half the evaluation budget of uniform selection. Figure 10 confirms with standard deviation across 100 runs.",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "LORAUTER works without ground-truth task labels using K-Means pseudo-task clusters",
    398       "evidence": "Figure 4 and Section 4.4 show that, with the best-performing number of K-Means clusters, pseudo-task routing achieves up to 101.5% (7B non-OOD) and 88.0% (7B OOD), comparable to routing with true task labels (101.2% and 88.4%, respectively).",
    399       "supported": "moderate"
    400     }
    401   ],
    402   "red_flags": [
    403     {
    404       "flag": "No error bars on main results",
    405       "detail": "All core routing performance numbers (Figures 2-3, Tables 2-6) are single point estimates. With only 50 test samples per task, random variance could be substantial, but no uncertainty quantification is provided for the central claims."
    406     },
    407     {
    408       "flag": "Small test set per task",
    409       "detail": "Only 50 test samples per task. For some tasks with high variance (e.g., DROP-EM ranges from 6% to 40%), the 5.2pp improvement claim could be within noise without statistical testing."
    410     },
    411     {
    412       "flag": "No limitations section",
    413       "detail": "The paper lacks any discussion of methodological limitations, scope boundaries, or threats to validity. Testing only LLaMA2 on FLANV2 tasks with a specific adapter training setup is a narrow evaluation basis for broad claims about 'effective LoRA adapter routing.'"
    414     },
    415     {
    416       "flag": "Single model family tested",
    417       "detail": "All experiments use LLaMA2 (7B and 13B variants). No evidence that LORAUTER works on other architectures (Mistral, Gemma, etc.) or with adapters trained with different PEFT methods."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "LoRA: Low-rank adaptation of large language models",
    423       "authors": ["E. J. Hu", "Yelong Shen", "P. Wallis", "Z. Allen-Zhu", "Y. Li", "S. Wang", "L. Wang", "W. Chen"],
    424       "year": 2022,
    425       "relevance": "Foundational PEFT method enabling modular adapter-based LLM specialization, central to the adapter routing problem addressed here."
    426     },
    427     {
    428       "title": "LoraRetriever: Input-aware LoRA retrieval and composition for mixed tasks in the wild",
    429       "authors": ["Z. Zhao", "L. Gan", "G. Wang", "W. Zhou", "H. Yang", "K. Kuang", "F. Wu"],
    430       "year": 2024,
    431       "relevance": "Primary baseline for LoRA adapter retrieval and composition; establishes the mixed-task evaluation benchmark used in this paper."
    432     },
    433     {
    434       "title": "Lorahub: Efficient cross-task generalization via dynamic LoRA composition",
    435       "authors": ["C. Huang", "Q. Liu", "B. Y. Lin", "T. Pang", "C. Du", "M. Lin"],
    436       "year": 2024,
    437       "relevance": "Adapter fusion method that learns task-specific mixture weights; baseline approach for LoRA composition."
    438     },
    439     {
    440       "title": "Towards modular LLMs by building and reusing a library of LoRAs",
    441       "authors": ["O. Ostapenko", "Z. Su", "E. M. Ponti", "L. Charlin", "N. Le Roux", "L. Caccia", "A. Sordoni"],
    442       "year": 2024,
    443       "relevance": "ARROW spectral routing method for LoRA adapters; training-free baseline using SVD-based adapter representations."
    444     },
    445     {
    446       "title": "Mixture of LoRA experts",
    447       "authors": ["X. Wu", "S. Huang", "F. Wei"],
    448       "year": 2024,
    449       "relevance": "MoE-style approach treating LoRA adapters as experts with learned gating; represents the trained-router paradigm contrasted with LORAUTER's training-free approach."
    450     },
    451     {
    452       "title": "SpectR: Dynamically composing LM experts with spectral routing",
    453       "authors": ["W. Fleshman", "B. Van Durme"],
    454       "year": 2025,
    455       "relevance": "Eigenspace-based spectral routing variant for dynamically composing language model experts."
    456     },
    457     {
    458       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    459       "authors": ["L. Chen", "M. Zaharia", "J. Zou"],
    460       "year": 2024,
    461       "relevance": "Model routing approach for cost-efficient LLM inference, representative of the broader model selection problem."
    462     },
    463     {
    464       "title": "Automix: Automatically mixing language models",
    465       "authors": ["P. Aggarwal", "A. Madaan", "A. Anand"],
    466       "year": 2024,
    467       "relevance": "Automatic language model mixing method using early exiting for efficient inference routing."
    468     },
    469     {
    470       "title": "MixLLM: Dynamic routing in mixed large language models",
    471       "authors": ["X. Wang", "Y. Liu", "W. Cheng", "X. Zhao", "Z. Chen", "W. Yu", "Y. Fu", "H. Chen"],
    472       "year": 2025,
    473       "relevance": "Dynamic model routing approach distinguishing predictive vs non-predictive routing strategies for LLM inference."
    474     },
    475     {
    476       "title": "ICL-Router: In-context learned model representations for LLM routing",
    477       "authors": ["C. Wang", "H. Li", "Y. Zhang", "L. Chen", "J. Chen", "P. Jian", "P. Ye", "Q. Zhang", "S. Hu"],
    478       "year": 2025,
    479       "relevance": "Routing framework using learned model capability embeddings for dynamic LLM selection."
    480     },
    481     {
    482       "title": "Few-shot parameter-efficient fine-tuning is better and cheaper than in-context learning",
    483       "authors": ["H. Liu", "D. Tam", "M. Muqeeth", "J. Mohta", "T. Huang", "M. Bansal", "C. Raffel"],
    484       "year": 2022,
    485       "relevance": "Establishes that PEFT methods can outperform in-context learning while being more cost-effective."
    486     }
    487   ]
    488 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs