scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24299B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Effective LoRA Adapter Routing using Task Representations",
      6     "authors": [
      7       "Akash Dhasade",
      8       "Anne-Marie Kermarrec",
      9       "Igor Pavlovic",
     10       "Diana Petrescu",
     11       "Rafael Pires",
     12       "Mathis Randl",
     13       "Martijn de Vos"
     14     ],
     15     "year": 2026,
     16     "venue": "arXiv.org",
     17     "arxiv_id": "2601.21795",
     18     "doi": "10.48550/arXiv.2601.21795"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Abstract claims of 101.2% Oracle performance and +5.2-point OOD improvement over LORARETRIEVER are directly supported by Figure 2 and Table 6; the 1500+ adapter scaling result is confirmed in Table 8.",
     26         "source": "haiku"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Ablation studies in Table 2 isolate the retrieval and composition components independently, providing adequate support for causal claims about which components drive improvements.",
     32         "source": "haiku"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper makes broad claims about 'scalable routing for open-ended LoRA serving' but evaluates only on LLaMA2-7B/13B with a single FLANV2-derived benchmark; generalization to other base models, modalities, or task distributions is not empirically validated.",
     38         "source": "haiku"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper does not consider whether gains stem from the sentence encoder quality, the specific benchmark structure, or the particular adapter training setup rather than the task-level routing paradigm itself.",
     44         "source": "haiku"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper uses task-specific metrics (EM, BLEU, ROUGE) appropriate to each task type and employs oracle-normalized aggregation rather than conflating these into a single undifferentiated score.",
     50         "source": "haiku"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The paper has an 'Impact Statement' section discussing broader societal impacts but no dedicated limitations or threats-to-validity section addressing technical constraints.",
     58         "source": "haiku"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No specific threats are discussed—the small test set of 50 samples per task, the potential contamination of FLAN data in LLaMA2 pretraining, and the restriction to a single benchmark are not acknowledged.",
     64         "source": "haiku"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The paper does not state explicit scope boundaries, such as that results apply only to LLaMA2-class models, only to NLP tasks, or only to FLAN-style benchmarks.",
     70         "source": "haiku"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No funding source is mentioned anywhere in the paper.",
     78         "source": "haiku"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "All authors are listed as affiliated with EPFL, Lausanne, Switzerland, disclosed in the author line.",
     84         "source": "haiku"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "No funder is disclosed, so independence cannot be assessed.",
     90         "source": "haiku"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is present.",
     96         "source": "haiku"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "LoRA, adapter routing, task representations, non-OOD, OOD, and semi-OOD are all formally defined in Sections 2 and 3.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The contributions are explicitly enumerated: training-free black-box routing, O(T) efficiency via task-level routing, and Successive Halving for adapter selection.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Table 1 provides a structured comparison of LORAUTER against five prior routing approaches along key dimensions, and Section 5 situates the work within MoE, model routing, and task-representation literature.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "No code repository is linked or mentioned in the paper; only the sentence encoder from HuggingFace (https://huggingface.co/Styxxxx/lora_retriever) is cited as a reused artifact.",
    127           "source": "haiku"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "The evaluation uses publicly available FLANV2 benchmark data and HuggingFace public adapters (1567 retrieved from the wild), both standard public resources used unmodified.",
    133           "source": "haiku"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "Only bfloat16 precision and LoRA rank/alpha hyperparameters are mentioned; no requirements file, Docker image, or full dependency specification is provided.",
    139           "source": "haiku"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "Algorithm 1 provides pseudocode for Successive Halving but no end-to-end instructions for reproducing experiments including data preparation, adapter training, or evaluation pipeline.",
    145           "source": "haiku"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "Standard deviation is reported only for the SH efficiency comparison (Figure 10) across 100 runs; main comparison results in Figure 2 and Table 6 report no uncertainty estimates.",
    153           "source": "haiku"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "No statistical significance tests are applied to any of the comparative results despite the paper making multiple ranking claims across methods.",
    159           "source": "haiku"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Normalized performance percentages and point differences (e.g., +5.2 points over LORARETRIEVER in OOD) are reported throughout with explicit baseline context.",
    165           "source": "haiku"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "The test set of 50 samples per task is adopted from Zhao et al. (2024) without discussion of whether this is sufficient for reliable per-task estimates.",
    171           "source": "haiku"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "Variance across runs is reported only for the SH budget experiment (Figure 10); main results tables contain point estimates only.",
    177           "source": "haiku"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Four baselines are included: LORAHUB, LORARETRIEVER, ARROW, and SpectR, plus an oracle task-aligned upper bound.",
    185           "source": "haiku"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "All baselines are recent (ICLR 2024, COLM 2024, Findings of ACL 2024, ICML 2024, COLM 2025), representing the current state of the field.",
    191           "source": "haiku"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Table 2 ablates retrieval and composition components independently by swapping LORARETRIEVER and LORAUTER components; Table 3 ablates K=1 vs K=3 fusion.",
    197           "source": "haiku"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Task-appropriate metrics are used: EM for classification, BLEU for translation, ROUGE-1/2/L for generation tasks, aggregated via oracle-normalized average.",
    203           "source": "haiku"
    204         },
    205         "human_evaluation": {
    206           "applies": false,
    207           "answer": false,
    208           "justification": "Human evaluation is not relevant for adapter routing on established NLP benchmarks with automated metrics.",
    209           "source": "haiku"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Routing uses small validation sets (up to 200 samples) while final evaluation uses disjoint held-out test sets of 50 samples per task, consistent with Zhao et al. (2024).",
    215           "source": "haiku"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Tables 11-18 provide per-task breakdowns across all 48 tasks grouped by category (struct-to-text, translation, commonsense, sentiment, reading comp, NLI, etc.).",
    221           "source": "haiku"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "The paper discusses that selection-based methods 'collapse' in OOD/Semi-OOD settings, and notes spectral routing methods perform worse because parameter values carry insufficient routing signal.",
    227           "source": "haiku"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "The paper reports that using too many or too few K-Means clusters degrades performance, that K=2 outperforms K=3 on some metrics, and that the HF-only adapter pool reduces performance vs. curated adapters.",
    233           "source": "haiku"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "LLaMA2-7B and LLaMA2-13B are specified with HuggingFace reference (meta-llama/Llama-2-7b-hf); the sentence encoder URL is provided (https://huggingface.co/Styxxxx/lora_retriever).",
    241           "source": "haiku"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "The embedding instruction is quoted ('Represent the sentence for similar task retrieval') and Alpaca format is referenced, but full prompts used for task evaluation are not provided.",
    247           "source": "haiku"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "LoRA rank r=6, scaling α=12, softmax temperature τ=0.2, K=3 adapters for fusion, and SH parameters (η, γ, R, warmup k) are reported.",
    253           "source": "haiku"
    254         },
    255         "scaffolding_described": {
    256           "applies": false,
    257           "answer": false,
    258           "justification": "No agentic scaffolding is involved; this is standard inference with composed adapter weights.",
    259           "source": "haiku"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": false,
    264           "justification": "The paper states it uses FLANV2 tasks and Alpaca instruction format but does not document the full preprocessing pipeline for constructing the 48-task evaluation set from FLANV2.",
    265           "source": "haiku"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "The underlying benchmark (FLANV2 subset from Zhao et al. 2024) uses publicly available datasets; the 1567 HuggingFace adapters are publicly accessible.",
    273           "source": "haiku"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "The benchmark construction is described: 48 tasks from FLANV2, 200 validation samples per task, 50 held-out test samples; HF adapters filtered by rank ≤64 for LLaMA2-7B.",
    279           "source": "haiku"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No human participants; benchmark data uses standard NLP datasets.",
    285           "source": "haiku"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": false,
    290           "justification": "The evaluation pipeline is described conceptually but the full FLANV2 → 48-task subset derivation, adapter training procedure, and validation/test split construction are not fully documented.",
    291           "source": "haiku"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "LLaMA2's training data cutoff is not stated, though the model's pretraining on FLAN-style data could affect evaluation on FLANV2-derived tasks.",
    299           "source": "haiku"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "No discussion of whether FLANV2 tasks or their test splits were included in LLaMA2's pretraining corpus, which is a real concern for exact-match evaluation tasks.",
    305           "source": "haiku"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "FLANV2 tasks were publicly available before LLaMA2's training cutoff; potential contamination is not acknowledged or addressed.",
    311           "source": "haiku"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants.",
    319           "source": "haiku"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants.",
    325           "source": "haiku"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants.",
    331           "source": "haiku"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants.",
    337           "source": "haiku"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants.",
    343           "source": "haiku"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants.",
    349           "source": "haiku"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants.",
    355           "source": "haiku"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": true,
    362           "justification": "Table 1 reports routing overhead complexity (O(T) vs O(N) vs O(NL)), and Section 4.5 and Figure 5 quantify the compute budget (adapter evaluations) for adapter selection under SH.",
    363           "source": "haiku"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "Total GPU hours or compute cost for running all experiments is not reported.",
    369           "source": "haiku"
    370         }
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "LORAUTER achieves 101.2% of Oracle task-aligned performance in non-OOD settings on LLaMA2-7B, effectively matching the upper bound of always selecting the perfect adapter.",
    377       "evidence": "Figure 2 and Table 6 show normalized average performance of 101.2% for LORAUTER vs 100% oracle on LLaMA2-7B non-OOD; confirmed by Table 11 per-task results.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "LORAUTER outperforms the strongest baseline (LORARETRIEVER) by +5.2 points in OOD settings on LLaMA2-7B.",
    382       "evidence": "Figure 2 shows 88.4% (LORAUTER) vs 83.2% (LORARETRIEVER) in OOD on LLaMA2-7B; the gap on LLaMA2-13B is smaller (86.8% vs 85.9%, +0.9 points).",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Task-level routing scales more efficiently than adapter-level routing, with O(T) complexity where T < N.",
    387       "evidence": "Table 1 compares complexity across methods; empirically demonstrated by maintaining competitive performance with 1500+ adapters where O(N) methods become infeasible.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "Successive Halving reduces the adapter evaluation budget by more than 2x compared to uniform selection with negligible performance loss.",
    392       "evidence": "Figure 5 and Figure 10 show SH reaches near-peak normalized performance (~0.95) at roughly half the evaluation budget of uniform selection, across 100 independent runs with std. dev. reported.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "LORAUTER scales to 1500+ heterogeneous 'wild' adapters from HuggingFace, achieving 85.7% normalized performance (vs 88.4% with curated adapters) in OOD settings.",
    397       "evidence": "Table 7 and Table 8 report per-task and aggregate results for HF-only and HF+48 adapter pools, showing competitive performance despite no curated adapters.",
    398       "supported": "strong"
    399     },
    400     {
    401       "claim": "Both the retrieval and composition components of LORAUTER independently contribute to performance gains over LORARETRIEVER.",
    402       "evidence": "Table 2 shows: LR retrieval + LA composition = 98.6% (non-OOD 7B); LA retrieval + LR composition = 96.7%; both together = 101.2%, vs LR+LR baseline of 92.9%.",
    403       "supported": "strong"
    404     }
    405   ],
    406   "methodology_tags": [
    407     "benchmark-eval"
    408   ],
    409   "key_findings": "LORAUTER is a training-free LoRA adapter routing framework that routes queries through task representations rather than directly to adapters, requiring only small validation sets and no adapter training data. In non-OOD settings it matches or slightly exceeds oracle performance (101.2%) by composing complementary task-relevant adapters with input-aware weighted fusion. In OOD settings it outperforms the best prior method (LORARETRIEVER) by 5.2 percentage points on LLaMA2-7B. The Successive Halving strategy reduces adapter evaluation cost by more than 2x while maintaining near-peak selection quality, and the framework remains effective when scaled to pools of 1500+ heterogeneous public HuggingFace adapters.",
    410   "red_flags": [
    411     {
    412       "flag": "No significance tests",
    413       "detail": "All comparative claims are presented as point estimates without statistical significance testing; given 50-sample test sets, many differences may not be statistically distinguishable."
    414     },
    415     {
    416       "flag": "No code release",
    417       "detail": "No repository or implementation is shared, making independent reproduction impossible beyond the algorithmic description."
    418     },
    419     {
    420       "flag": "No limitations section",
    421       "detail": "The paper has no dedicated limitations or threats-to-validity section; the Impact Statement discusses societal concerns but not methodological constraints."
    422     },
    423     {
    424       "flag": "Single benchmark",
    425       "detail": "All experiments use the same FLANV2-derived 48-task benchmark from Zhao et al. (2024); generalization to other domains, modalities, or base models is unvalidated."
    426     },
    427     {
    428       "flag": "Benchmark contamination unaddressed",
    429       "detail": "FLANV2 tasks were available before LLaMA2's training cutoff; potential overlap between training data and evaluation benchmarks is not acknowledged."
    430     },
    431     {
    432       "flag": "Small per-task test sets",
    433       "detail": "With only 50 held-out samples per task, individual task results (e.g., EM scores that change by 2-4 points) may reflect noise rather than true method differences."
    434     }
    435   ],
    436   "cited_papers": [
    437     {
    438       "title": "LoraRetriever: Input-aware LoRA Retrieval and Composition for Mixed Tasks in the Wild",
    439       "relevance": "Primary baseline and benchmark source; LORAUTER directly compares against and extends this work on adapter routing"
    440     },
    441     {
    442       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    443       "relevance": "Foundational method that LORAUTER builds upon for parameter-efficient fine-tuning"
    444     },
    445     {
    446       "title": "LoraHub: Efficient Cross-Task Generalization via Dynamic LoRA Composition",
    447       "relevance": "Key baseline for adapter composition using learned fusion weights"
    448     },
    449     {
    450       "title": "Towards Modular LLMs by Building and Reusing a Library of LoRAs (ARROW)",
    451       "relevance": "Spectral routing baseline; representative of parameter-space routing approaches"
    452     },
    453     {
    454       "title": "Mixture of LoRA Experts (MoLE)",
    455       "relevance": "MoE-style baseline requiring training data for routing"
    456     },
    457     {
    458       "title": "AdapterSoup: Weight Averaging to Improve Generalization of Pretrained Language Models",
    459       "relevance": "Baseline approach for adapter composition via weight averaging"
    460     },
    461     {
    462       "title": "Finetuned Language Models are Zero-Shot Learners (FLAN)",
    463       "relevance": "Source of the evaluation benchmark used throughout experiments"
    464     },
    465     {
    466       "title": "SpectR: Dynamically Composing LM Experts with Spectral Routing",
    467       "relevance": "Recent spectral routing baseline evaluated as a competitor"
    468     },
    469     {
    470       "title": "Non-stochastic Best Arm Identification and Hyperparameter Optimization (Successive Halving)",
    471       "relevance": "Core algorithm adopted for efficient adapter selection within LORAUTER"
    472     }
    473   ],
    474   "engagement_factors": {
    475     "practical_relevance": {
    476       "score": 3,
    477       "justification": "Directly applicable to any practitioner using the 2300+ public LoRA adapters on HuggingFace without access to training data."
    478     },
    479     "surprise_contrarian": {
    480       "score": 1,
    481       "justification": "The task-level routing insight is logical and incrementally novel rather than surprising; the above-oracle result (101.2%) is mildly interesting."
    482     },
    483     "fear_safety": {
    484       "score": 0,
    485       "justification": "No AI safety concerns raised beyond a brief mention of inherited biases in the Impact Statement."
    486     },
    487     "drama_conflict": {
    488       "score": 1,
    489       "justification": "Mild competitive framing against LORARETRIEVER with clear margin claims, but no broader controversy."
    490     },
    491     "demo_ability": {
    492       "score": 2,
    493       "justification": "The framework could be tried with public HuggingFace adapters, though no code is released to lower the barrier."
    494     },
    495     "brand_recognition": {
    496       "score": 1,
    497       "justification": "EPFL is a well-regarded research institution but not a top-tier AI lab; no famous authors or product associations."
    498     }
    499   },
    500   "hn_data": {
    501     "threads": [],
    502     "top_points": 0,
    503     "total_points": 0,
    504     "total_comments": 0
    505   }
    506 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs