scan-v5.json (24299B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Effective LoRA Adapter Routing using Task Representations", 6 "authors": [ 7 "Akash Dhasade", 8 "Anne-Marie Kermarrec", 9 "Igor Pavlovic", 10 "Diana Petrescu", 11 "Rafael Pires", 12 "Mathis Randl", 13 "Martijn de Vos" 14 ], 15 "year": 2026, 16 "venue": "arXiv.org", 17 "arxiv_id": "2601.21795", 18 "doi": "10.48550/arXiv.2601.21795" 19 }, 20 "checklist": { 21 "claims_and_evidence": { 22 "abstract_claims_supported": { 23 "applies": true, 24 "answer": true, 25 "justification": "Abstract claims of 101.2% Oracle performance and +5.2-point OOD improvement over LORARETRIEVER are directly supported by Figure 2 and Table 6; the 1500+ adapter scaling result is confirmed in Table 8.", 26 "source": "haiku" 27 }, 28 "causal_claims_justified": { 29 "applies": true, 30 "answer": true, 31 "justification": "Ablation studies in Table 2 isolate the retrieval and composition components independently, providing adequate support for causal claims about which components drive improvements.", 32 "source": "haiku" 33 }, 34 "generalization_bounded": { 35 "applies": true, 36 "answer": false, 37 "justification": "The paper makes broad claims about 'scalable routing for open-ended LoRA serving' but evaluates only on LLaMA2-7B/13B with a single FLANV2-derived benchmark; generalization to other base models, modalities, or task distributions is not empirically validated.", 38 "source": "haiku" 39 }, 40 "alternative_explanations_discussed": { 41 "applies": true, 42 "answer": false, 43 "justification": "The paper does not consider whether gains stem from the sentence encoder quality, the specific benchmark structure, or the particular adapter training setup rather than the task-level routing paradigm itself.", 44 "source": "haiku" 45 }, 46 "proxy_outcome_distinction": { 47 "applies": true, 48 "answer": true, 49 "justification": "The paper uses task-specific metrics (EM, BLEU, ROUGE) appropriate to each task type and 
employs oracle-normalized aggregation rather than conflating these into a single undifferentiated score.", 50 "source": "haiku" 51 } 52 }, 53 "limitations_and_scope": { 54 "limitations_section_present": { 55 "applies": true, 56 "answer": false, 57 "justification": "The paper has an 'Impact Statement' section discussing broader societal impacts but no dedicated limitations or threats-to-validity section addressing technical constraints.", 58 "source": "haiku" 59 }, 60 "threats_to_validity_specific": { 61 "applies": true, 62 "answer": false, 63 "justification": "No specific threats are discussed—the small test set of 50 samples per task, the potential contamination of FLAN data in LLaMA2 pretraining, and the restriction to a single benchmark are not acknowledged.", 64 "source": "haiku" 65 }, 66 "scope_boundaries_stated": { 67 "applies": true, 68 "answer": false, 69 "justification": "The paper does not state explicit scope boundaries, such as that results apply only to LLaMA2-class models, only to NLP tasks, or only to FLAN-style benchmarks.", 70 "source": "haiku" 71 } 72 }, 73 "conflicts_of_interest": { 74 "funding_disclosed": { 75 "applies": true, 76 "answer": false, 77 "justification": "No funding source is mentioned anywhere in the paper.", 78 "source": "haiku" 79 }, 80 "affiliations_disclosed": { 81 "applies": true, 82 "answer": true, 83 "justification": "All authors are listed as affiliated with EPFL, Lausanne, Switzerland, disclosed in the author line.", 84 "source": "haiku" 85 }, 86 "funder_independent_of_outcome": { 87 "applies": false, 88 "answer": false, 89 "justification": "No funder is disclosed, so independence cannot be assessed.", 90 "source": "haiku" 91 }, 92 "financial_interests_declared": { 93 "applies": true, 94 "answer": false, 95 "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is present.", 96 "source": "haiku" 97 } 98 }, 99 "scope_and_framing": { 100 "key_terms_defined": { 
101 "applies": true, 102 "answer": true, 103 "justification": "LoRA, adapter routing, task representations, non-OOD, OOD, and semi-OOD are all formally defined in Sections 2 and 3.", 104 "source": "haiku" 105 }, 106 "intended_contribution_clear": { 107 "applies": true, 108 "answer": true, 109 "justification": "The contributions are explicitly enumerated: training-free black-box routing, O(T) efficiency via task-level routing, and Successive Halving for adapter selection.", 110 "source": "haiku" 111 }, 112 "engagement_with_prior_work": { 113 "applies": true, 114 "answer": true, 115 "justification": "Table 1 provides a structured comparison of LORAUTER against five prior routing approaches along key dimensions, and Section 5 situates the work within MoE, model routing, and task-representation literature.", 116 "source": "haiku" 117 } 118 } 119 }, 120 "type_checklist": { 121 "empirical": { 122 "artifacts": { 123 "code_released": { 124 "applies": true, 125 "answer": false, 126 "justification": "No code repository is linked or mentioned in the paper; only the sentence encoder from HuggingFace (https://huggingface.co/Styxxxx/lora_retriever) is cited as a reused artifact.", 127 "source": "haiku" 128 }, 129 "data_released": { 130 "applies": true, 131 "answer": true, 132 "justification": "The evaluation uses publicly available FLANV2 benchmark data and HuggingFace public adapters (1567 retrieved from the wild), both standard public resources used unmodified.", 133 "source": "haiku" 134 }, 135 "environment_specified": { 136 "applies": true, 137 "answer": false, 138 "justification": "Only bfloat16 precision and LoRA rank/alpha hyperparameters are mentioned; no requirements file, Docker image, or full dependency specification is provided.", 139 "source": "haiku" 140 }, 141 "reproduction_instructions": { 142 "applies": true, 143 "answer": false, 144 "justification": "Algorithm 1 provides pseudocode for Successive Halving but no end-to-end instructions for reproducing 
experiments including data preparation, adapter training, or evaluation pipeline.", 145 "source": "haiku" 146 } 147 }, 148 "statistical_methodology": { 149 "confidence_intervals_or_error_bars": { 150 "applies": true, 151 "answer": false, 152 "justification": "Standard deviation is reported only for the SH efficiency comparison (Figure 10) across 100 runs; main comparison results in Figure 2 and Table 6 report no uncertainty estimates.", 153 "source": "haiku" 154 }, 155 "significance_tests": { 156 "applies": true, 157 "answer": false, 158 "justification": "No statistical significance tests are applied to any of the comparative results despite the paper making multiple ranking claims across methods.", 159 "source": "haiku" 160 }, 161 "effect_sizes_reported": { 162 "applies": true, 163 "answer": true, 164 "justification": "Normalized performance percentages and point differences (e.g., +5.2 points over LORARETRIEVER in OOD) are reported throughout with explicit baseline context.", 165 "source": "haiku" 166 }, 167 "sample_size_justified": { 168 "applies": true, 169 "answer": false, 170 "justification": "The test set of 50 samples per task is adopted from Zhao et al. 
(2024) without discussion of whether this is sufficient for reliable per-task estimates.", 171 "source": "haiku" 172 }, 173 "variance_reported": { 174 "applies": true, 175 "answer": false, 176 "justification": "Variance across runs is reported only for the SH budget experiment (Figure 10); main results tables contain point estimates only.", 177 "source": "haiku" 178 } 179 }, 180 "evaluation_design": { 181 "baselines_included": { 182 "applies": true, 183 "answer": true, 184 "justification": "Four baselines are included: LORAHUB, LORARETRIEVER, ARROW, and SpectR, plus an oracle task-aligned upper bound.", 185 "source": "haiku" 186 }, 187 "baselines_contemporary": { 188 "applies": true, 189 "answer": true, 190 "justification": "All baselines are recent (ICLR 2024, COLM 2024, Findings of ACL 2024, ICML 2024, COLM 2025), representing the current state of the field.", 191 "source": "haiku" 192 }, 193 "ablation_study": { 194 "applies": true, 195 "answer": true, 196 "justification": "Table 2 ablates retrieval and composition components independently by swapping LORARETRIEVER and LORAUTER components; Table 3 ablates K=1 vs K=3 fusion.", 197 "source": "haiku" 198 }, 199 "multiple_metrics": { 200 "applies": true, 201 "answer": true, 202 "justification": "Task-appropriate metrics are used: EM for classification, BLEU for translation, ROUGE-1/2/L for generation tasks, aggregated via oracle-normalized average.", 203 "source": "haiku" 204 }, 205 "human_evaluation": { 206 "applies": false, 207 "answer": false, 208 "justification": "Human evaluation is not relevant for adapter routing on established NLP benchmarks with automated metrics.", 209 "source": "haiku" 210 }, 211 "held_out_test_set": { 212 "applies": true, 213 "answer": true, 214 "justification": "Routing uses small validation sets (up to 200 samples) while final evaluation uses disjoint held-out test sets of 50 samples per task, consistent with Zhao et al. 
(2024).", 215 "source": "haiku" 216 }, 217 "per_category_breakdown": { 218 "applies": true, 219 "answer": true, 220 "justification": "Tables 11-18 provide per-task breakdowns across all 48 tasks grouped by category (struct-to-text, translation, commonsense, sentiment, reading comp, NLI, etc.).", 221 "source": "haiku" 222 }, 223 "failure_cases_discussed": { 224 "applies": true, 225 "answer": true, 226 "justification": "The paper discusses that selection-based methods 'collapse' in OOD/Semi-OOD settings, and notes spectral routing methods perform worse because parameter values carry insufficient routing signal.", 227 "source": "haiku" 228 }, 229 "negative_results_reported": { 230 "applies": true, 231 "answer": true, 232 "justification": "The paper reports that using too many or too few K-Means clusters degrades performance, that K=2 outperforms K=3 on some metrics, and that the HF-only adapter pool reduces performance vs. curated adapters.", 233 "source": "haiku" 234 } 235 }, 236 "setup_transparency": { 237 "model_versions_specified": { 238 "applies": true, 239 "answer": true, 240 "justification": "LLaMA2-7B and LLaMA2-13B are specified with HuggingFace reference (meta-llama/Llama-2-7b-hf); the sentence encoder URL is provided (https://huggingface.co/Styxxxx/lora_retriever).", 241 "source": "haiku" 242 }, 243 "prompts_provided": { 244 "applies": true, 245 "answer": false, 246 "justification": "The embedding instruction is quoted ('Represent the sentence for similar task retrieval') and Alpaca format is referenced, but full prompts used for task evaluation are not provided.", 247 "source": "haiku" 248 }, 249 "hyperparameters_reported": { 250 "applies": true, 251 "answer": true, 252 "justification": "LoRA rank r=6, scaling α=12, softmax temperature τ=0.2, K=3 adapters for fusion, and SH parameters (η, γ, R, warmup k) are reported.", 253 "source": "haiku" 254 }, 255 "scaffolding_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No agentic 
scaffolding is involved; this is standard inference with composed adapter weights.", 259 "source": "haiku" 260 }, 261 "data_preprocessing_documented": { 262 "applies": true, 263 "answer": false, 264 "justification": "The paper states it uses FLANV2 tasks and Alpaca instruction format but does not document the full preprocessing pipeline for constructing the 48-task evaluation set from FLANV2.", 265 "source": "haiku" 266 } 267 }, 268 "data_integrity": { 269 "raw_data_available": { 270 "applies": true, 271 "answer": true, 272 "justification": "The underlying benchmark (FLANV2 subset from Zhao et al. 2024) uses publicly available datasets; the 1567 HuggingFace adapters are publicly accessible.", 273 "source": "haiku" 274 }, 275 "data_collection_described": { 276 "applies": true, 277 "answer": true, 278 "justification": "The benchmark construction is described: 48 tasks from FLANV2, 200 validation samples per task, 50 held-out test samples; HF adapters filtered by rank ≤64 for LLaMA2-7B.", 279 "source": "haiku" 280 }, 281 "recruitment_methods_described": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants; benchmark data uses standard NLP datasets.", 285 "source": "haiku" 286 }, 287 "data_pipeline_documented": { 288 "applies": true, 289 "answer": false, 290 "justification": "The evaluation pipeline is described conceptually but the full FLANV2 → 48-task subset derivation, adapter training procedure, and validation/test split construction are not fully documented.", 291 "source": "haiku" 292 } 293 }, 294 "contamination": { 295 "training_cutoff_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "LLaMA2's training data cutoff is not stated, though the model's pretraining on FLAN-style data could affect evaluation on FLANV2-derived tasks.", 299 "source": "haiku" 300 }, 301 "train_test_overlap_discussed": { 302 "applies": true, 303 "answer": false, 304 "justification": "No discussion of whether FLANV2 tasks or 
their test splits were included in LLaMA2's pretraining corpus, which is a real concern for exact-match evaluation tasks.", 305 "source": "haiku" 306 }, 307 "benchmark_contamination_addressed": { 308 "applies": true, 309 "answer": false, 310 "justification": "FLANV2 tasks were publicly available before LLaMA2's training cutoff; potential contamination is not acknowledged or addressed.", 311 "source": "haiku" 312 } 313 }, 314 "human_studies": { 315 "pre_registered": { 316 "applies": false, 317 "answer": false, 318 "justification": "No human participants.", 319 "source": "haiku" 320 }, 321 "irb_or_ethics_approval": { 322 "applies": false, 323 "answer": false, 324 "justification": "No human participants.", 325 "source": "haiku" 326 }, 327 "demographics_reported": { 328 "applies": false, 329 "answer": false, 330 "justification": "No human participants.", 331 "source": "haiku" 332 }, 333 "inclusion_exclusion_criteria": { 334 "applies": false, 335 "answer": false, 336 "justification": "No human participants.", 337 "source": "haiku" 338 }, 339 "randomization_described": { 340 "applies": false, 341 "answer": false, 342 "justification": "No human participants.", 343 "source": "haiku" 344 }, 345 "blinding_described": { 346 "applies": false, 347 "answer": false, 348 "justification": "No human participants.", 349 "source": "haiku" 350 }, 351 "attrition_reported": { 352 "applies": false, 353 "answer": false, 354 "justification": "No human participants.", 355 "source": "haiku" 356 } 357 }, 358 "cost_and_practicality": { 359 "inference_cost_reported": { 360 "applies": true, 361 "answer": true, 362 "justification": "Table 1 reports routing overhead complexity (O(T) vs O(N) vs O(NL)), and Section 4.5 and Figure 5 quantify the compute budget (adapter evaluations) for adapter selection under SH.", 363 "source": "haiku" 364 }, 365 "compute_budget_stated": { 366 "applies": true, 367 "answer": false, 368 "justification": "Total GPU hours or compute cost for running all experiments is 
not reported.", 369 "source": "haiku" 370 } 371 } 372 } 373 }, 374 "claims": [ 375 { 376 "claim": "LORAUTER achieves 101.2% of Oracle task-aligned performance in non-OOD settings on LLaMA2-7B, effectively matching the upper bound of always selecting the perfect adapter.", 377 "evidence": "Figure 2 and Table 6 show normalized average performance of 101.2% for LORAUTER vs 100% oracle on LLaMA2-7B non-OOD; confirmed by Table 11 per-task results.", 378 "supported": "strong" 379 }, 380 { 381 "claim": "LORAUTER outperforms the strongest baseline (LORARETRIEVER) by +5.2 points in OOD settings on LLaMA2-7B.", 382 "evidence": "Figure 2 shows 88.4% (LORAUTER) vs 83.2% (LORARETRIEVER) in OOD on LLaMA2-7B; the gap on LLaMA2-13B is much narrower (86.8% vs 85.9%, +0.9 points).", 383 "supported": "strong" 384 }, 385 { 386 "claim": "Task-level routing scales more efficiently than adapter-level routing, with O(T) complexity where T < N.", 387 "evidence": "Table 1 compares complexity across methods; empirically demonstrated by maintaining competitive performance with 1500+ adapters where O(N) methods become infeasible.", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "Successive Halving reduces the adapter evaluation budget by more than 2x compared to uniform selection with negligible performance loss.", 392 "evidence": "Figure 5 and Figure 10 show SH reaches near-peak normalized performance (~0.95) at roughly half the evaluation budget of uniform selection, across 100 independent runs with std. dev. 
reported.", 393 "supported": "strong" 394 }, 395 { 396 "claim": "LORAUTER scales to 1500+ heterogeneous 'wild' adapters from HuggingFace, achieving 85.7% normalized performance (vs 88.4% with curated adapters) in OOD settings.", 397 "evidence": "Table 7 and Table 8 report per-task and aggregate results for HF-only and HF+48 adapter pools, showing competitive performance despite no curated adapters.", 398 "supported": "strong" 399 }, 400 { 401 "claim": "Both the retrieval and composition components of LORAUTER independently contribute to performance gains over LORARETRIEVER.", 402 "evidence": "Table 2 shows: LR retrieval + LA composition = 98.6% (non-OOD 7B); LA retrieval + LR composition = 96.7%; both together = 101.2%, vs LR+LR baseline of 92.9%.", 403 "supported": "strong" 404 } 405 ], 406 "methodology_tags": [ 407 "benchmark-eval" 408 ], 409 "key_findings": "LORAUTER is a training-free LoRA adapter routing framework that routes queries through task representations rather than directly to adapters, requiring only small validation sets and no adapter training data. In non-OOD settings it matches or slightly exceeds oracle performance (101.2%) by composing complementary task-relevant adapters with input-aware weighted fusion. In OOD settings it outperforms the best prior method (LORARETRIEVER) by 5.2 percentage points on LLaMA2-7B. The Successive Halving strategy reduces adapter evaluation cost by more than 2x while maintaining near-peak selection quality, and the framework remains effective when scaled to pools of 1500+ heterogeneous public HuggingFace adapters.", 410 "red_flags": [ 411 { 412 "flag": "No significance tests", 413 "detail": "All comparative claims are presented as point estimates without statistical significance testing; given 50-sample test sets, many differences may not be statistically distinguishable." 
414 }, 415 { 416 "flag": "No code release", 417 "detail": "No repository or implementation is shared, making independent reproduction impossible beyond the algorithmic description." 418 }, 419 { 420 "flag": "No limitations section", 421 "detail": "The paper has no dedicated limitations or threats-to-validity section; the Impact Statement discusses societal concerns but not methodological constraints." 422 }, 423 { 424 "flag": "Single benchmark", 425 "detail": "All experiments use the same FLANV2-derived 48-task benchmark from Zhao et al. (2024); generalization to other domains, modalities, or base models is unvalidated." 426 }, 427 { 428 "flag": "Benchmark contamination unaddressed", 429 "detail": "FLANV2 tasks were available before LLaMA2's training cutoff; potential overlap between training data and evaluation benchmarks is not acknowledged." 430 }, 431 { 432 "flag": "Small per-task test sets", 433 "detail": "With only 50 held-out samples per task, individual task results (e.g., EM scores that change by 2-4 points) may reflect noise rather than true method differences." 
434 } 435 ], 436 "cited_papers": [ 437 { 438 "title": "LoraRetriever: Input-aware LoRA Retrieval and Composition for Mixed Tasks in the Wild", 439 "relevance": "Primary baseline and benchmark source; LORAUTER directly compares against and extends this work on adapter routing" 440 }, 441 { 442 "title": "LoRA: Low-Rank Adaptation of Large Language Models", 443 "relevance": "Foundational method that LORAUTER builds upon for parameter-efficient fine-tuning" 444 }, 445 { 446 "title": "LoraHub: Efficient Cross-Task Generalization via Dynamic LoRA Composition", 447 "relevance": "Key baseline for adapter composition using learned fusion weights" 448 }, 449 { 450 "title": "Towards Modular LLMs by Building and Reusing a Library of LoRAs (ARROW)", 451 "relevance": "Spectral routing baseline; representative of parameter-space routing approaches" 452 }, 453 { 454 "title": "Mixture of LoRA Experts (MoLE)", 455 "relevance": "MoE-style baseline requiring training data for routing" 456 }, 457 { 458 "title": "AdapterSoup: Weight Averaging to Improve Generalization of Pretrained Language Models", 459 "relevance": "Baseline approach for adapter composition via weight averaging" 460 }, 461 { 462 "title": "Finetuned Language Models are Zero-Shot Learners (FLAN)", 463 "relevance": "Source of the evaluation benchmark used throughout experiments" 464 }, 465 { 466 "title": "SpectR: Dynamically Composing LM Experts with Spectral Routing", 467 "relevance": "Recent spectral routing baseline evaluated as a competitor" 468 }, 469 { 470 "title": "Non-stochastic Best Arm Identification and Hyperparameter Optimization (Successive Halving)", 471 "relevance": "Core algorithm adopted for efficient adapter selection within LORAUTER" 472 } 473 ], 474 "engagement_factors": { 475 "practical_relevance": { 476 "score": 3, 477 "justification": "Directly applicable to any practitioner using the 2300+ public LoRA adapters on HuggingFace without access to training data." 
478 }, 479 "surprise_contrarian": { 480 "score": 1, 481 "justification": "The task-level routing insight is logical and incrementally novel rather than surprising; the above-oracle result (101.2%) is mildly interesting." 482 }, 483 "fear_safety": { 484 "score": 0, 485 "justification": "No AI safety concerns raised beyond a brief mention of inherited biases in the Impact Statement." 486 }, 487 "drama_conflict": { 488 "score": 1, 489 "justification": "Mild competitive framing against LORARETRIEVER with clear margin claims, but no broader controversy." 490 }, 491 "demo_ability": { 492 "score": 2, 493 "justification": "The framework could be tried with public HuggingFace adapters, though no code is released to lower the barrier." 494 }, 495 "brand_recognition": { 496 "score": 1, 497 "justification": "EPFL is a well-regarded research institution but not a top-tier AI lab; no famous authors or product associations." 498 } 499 }, 500 "hn_data": { 501 "threads": [], 502 "top_points": 0, 503 "total_points": 0, 504 "total_comments": 0 505 } 506 }