scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (31951B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Effective LoRA Adapter Routing using Task Representations",
      6     "authors": [
      7       "Akash Dhasade",
      8       "Anne-Marie Kermarrec",
      9       "Igor Pavlovic",
     10       "Diana Petrescu",
     11       "Rafael Pires",
     12       "Mathis Randl",
     13       "Martijn de Vos"
     14     ],
     15     "year": 2026,
     16     "venue": "arXiv.org",
     17     "arxiv_id": "2601.21795",
     18     "doi": "10.48550/arXiv.2601.21795"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Abstract claims are supported: 'matching Oracle performance (101.2%)' is in Figure 2 and Table 6; '+5.2 points' OOD improvement over LORARETRIEVER matches Figure 2; scaling to 1500+ adapters is in Section 4.2 and Tables 7-8.",
     26         "source": "opus"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The paper makes causal claims through ablation studies in Section 4.3 (Table 2): swapping individual components (retrieval, composition) demonstrates their individual causal contributions. These are controlled single-variable manipulations adequate for the causal claims made.",
     32         "source": "opus"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper tests only on LLaMA2 (7B and 13B) and FLANV2 tasks but makes broad claims: 'a practical foundation for leveraging large, dynamic adapter pools for LoRA routing' (Section 6). The title 'Effective LoRA Adapter Routing' implies general applicability beyond the tested model family and task set.",
     38         "source": "opus"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No alternative explanations for the results are discussed. The paper does not consider confounds such as whether gains come from the specific task taxonomy used, the quality of the sentence encoder, or dataset-specific properties of FLANV2.",
     44         "source": "opus"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper's claims match the granularity of its measurements. It claims routing effectiveness and measures it via task-specific metrics (EM, BLEU, ROUGE) normalized by oracle performance. No broader framing gap exists between what is measured and what is claimed.",
     50         "source": "opus"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No dedicated Limitations section exists. The Impact Statement discusses potential biases and misuse risks but does not address methodological limitations of the study itself.",
     58         "source": "opus"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No specific threats to validity are discussed. The Impact Statement mentions 'potential biases, privacy risks, or harmful behaviors' inherited from base models but these are generic deployment concerns, not study-specific methodological threats.",
     64         "source": "opus"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No explicit statements about what the results do NOT show. The paper does not acknowledge that results are limited to LLaMA2, FLANV2 tasks, or the specific adapter training configuration.",
     70         "source": "opus"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section listing grants or sponsors.",
     78         "source": "opus"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "All authors are listed as affiliated with EPFL, Lausanne, Switzerland. They are not evaluating a commercial product from their institution.",
     84         "source": "opus"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "Cannot assess funder independence since no funding is disclosed. The work is academic (EPFL) and does not evaluate a commercial product, but without funding disclosure, independence cannot be verified.",
     90         "source": "opus"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests or financial interests statement is present in the paper.",
     96         "source": "opus"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "LoRA is formally defined in Section 2.1 with mathematical notation, adapter routing is formalized in Section 2.3 with explicit problem formulation, and task representations are precisely defined in Section 3.3.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper explicitly lists three contributions as bullet points: training-free black-box routing, efficiency (O(T) scaling with Successive Halving), and extensive evaluation across large adapter pools.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 2.4 and Table 1 systematically compare LORAUTER against five prior methods along key dimensions (training data, training-free, overhead), and Section 5 situates the work within MoE, model routing, and task representation literatures.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "No code repository URL, GitHub link, or archive is provided anywhere in the paper.",
    127           "source": "opus"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "The paper uses publicly available FLANV2 datasets (Wei et al., 2022) and publicly available HuggingFace LoRA adapters. The specific adapter pool URL is provided (huggingface.co/models?other=base_model:adapter:meta-llama/Llama-2-7b-hf), and the sentence encoder URL is given (huggingface.co/Styxxxx/lora_retriever).",
    133           "source": "opus"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "The paper mentions bfloat16 precision and model names (LLaMA2-7B/13B) but provides no requirements.txt, Dockerfile, or detailed dependency listing sufficient to recreate the environment.",
    139           "source": "opus"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "No step-by-step reproduction instructions, README, or runnable scripts are described.",
    145           "source": "opus"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "Main routing results (Figures 2-3, Tables 2-6) report only point estimates of normalized performance with no confidence intervals or error bars. Figure 10 shows standard deviation for SH over 100 runs, but this is for a supplementary analysis, not the core claims.",
    153           "source": "opus"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "No statistical significance tests are used. Claims like 'LORAUTER outperforms LORARETRIEVER by +5.2 points' are based solely on comparing point estimates without any test (p-values, bootstrap, etc.).",
    159           "source": "opus"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "The paper reports percentage-point improvements with baseline context: '+5.2 percentage points' over LORARETRIEVER in OOD (88.4% vs 83.2%), '101.2% of oracle' in non-OOD, and '>2x reduction' in adapter evaluations via SH. Absolute and relative figures are consistently provided.",
    165           "source": "opus"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "50 test samples per task and at most 200 validation samples per task are used without any justification for why these sizes are sufficient. No power analysis is provided.",
    171           "source": "opus"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "Main routing results (Tables 2, 3, 6) are single-point estimates with no variance or standard deviation across runs. The task representation construction involves random sampling of validation queries (Section 3.3), but the sensitivity of main results to this randomness is not assessed. Only the SH analysis (Figure 10) reports std dev over 100 runs.",
    177           "source": "opus"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Four baselines are compared: LORAHUB, LORARETRIEVER, ARROW, and SpectR, plus an Oracle task-aligned adapter (Section 4.1). All are evaluated under identical conditions.",
    185           "source": "opus"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "All baselines are recent: MOLE (ICLR24), LORAHUB (COLM24), LORARETRIEVER (FindingsACL24), ARROW (ICML24), SpectR (2025). These represent the current state of the art in LoRA adapter routing.",
    191           "source": "opus"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Section 4.3 presents two ablation studies: (1) swapping retrieval and composition components between LORARETRIEVER and LORAUTER (Table 2), and (2) K=1 vs K=3 to show composition benefit (Table 3). Section 4.4 studies pseudo-task clusters. Table 10 ablates K values from 1-5.",
    197           "source": "opus"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Multiple task-specific metrics are used: Exact Match (EM) for NLU tasks, BLEU for translation, and ROUGE-1/2/L for structure-to-text generation (Section 4.1). Results are also reported as normalized averages.",
    203           "source": "opus"
    204         },
    205         "human_evaluation": {
    206           "applies": true,
    207           "answer": false,
    208           "justification": "All evaluation is fully automated using EM, BLEU, and ROUGE metrics. No human evaluation of output quality is conducted.",
    209           "source": "opus"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Section 4.1 states: 'For the final evaluation, we use the same test set as Zhao et al. (2024), consisting of 50 held-out test samples for each of the 48 tasks.' Validation sets are explicitly separated and used only for routing/adapter selection.",
    215           "source": "opus"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Tables 6, 11, 12 provide per-domain breakdowns (struct-to-text, commonsense, sentiment, etc.) and Tables 11-18 in the appendix provide full per-task results for all 48 tasks across all methods and settings.",
    221           "source": "opus"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": false,
    226           "justification": "While aggregate performance degradation in OOD settings is noted, no qualitative failure analysis is provided. The paper does not examine specific queries or tasks where LORAUTER fails or produces poor outputs, or explain why particular routing decisions go wrong.",
    227           "source": "opus"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "Several negative results are reported: K=1 (no composition) significantly underperforms K=3 (Table 3); selection-based methods collapse in OOD (Section B.1, Table 4); too few or too many pseudo-task clusters hurt performance (Figure 4); and baselines like LORAHUB and SpectR substantially underperform.",
    233           "source": "opus"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Specific model identifiers are provided: 'LLaMA 2-7B,13B (Touvron et al., 2023b)' with reference to the specific paper. The sentence encoder is identified by HuggingFace URL (huggingface.co/Styxxxx/lora_retriever).",
    241           "source": "opus"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "The embedding instruction is provided ('Represent the sentence for similar task retrieval', Section 3.3), but the actual task evaluation prompts used for each of the 48 FLANV2 tasks are not reproduced. The paper references Alpaca instruction format and FLANV2 verbalization but does not include the prompt text.",
    247           "source": "opus"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "Key hyperparameters are reported in Section 4.1 and Appendix A: K=3 for top-K fusion, τ=0.2 softmax temperature, LoRA rank r=6, scaling α=12, bfloat16 precision, and at most 200 validation samples per task.",
    253           "source": "opus"
    254         },
    255         "scaffolding_described": {
    256           "applies": false,
    257           "answer": false,
    258           "justification": "No agentic scaffolding is used. LORAUTER is a routing and composition framework, not an agentic system.",
    259           "source": "opus"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "Section 4.1 and Appendix A.2 describe the dataset setup: 48 tasks from FLANV2, adapter training with Alpaca format, validation/test splits, and HuggingFace adapter collection criteria (rank ≤64 for LLaMA2-7B, yielding 1567 adapters).",
    265           "source": "opus"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "While the input datasets (FLANV2) and adapters are publicly available, the experimental outputs (per-query predictions, adapter selections, routing scores) are not released for independent verification.",
    273           "source": "opus"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "Section 4.1 describes adapter training (48 adapters, one per FLANV2 task, with r=6, α=12, Alpaca format). Section 4.2 describes HuggingFace adapter collection (all LLaMA2-7B adapters with rank ≤64). Appendix A.2 details the dataset taxonomy and formatting.",
    279           "source": "opus"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No human participants. Data sources are standard public benchmarks (FLANV2) and public adapter repositories (HuggingFace).",
    285           "source": "opus"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "The pipeline from task selection → adapter training → validation set construction → task representation building → evaluation is documented across Sections 3 and 4. Appendix A describes evaluation details including instruction formatting, data splits, and metric computation.",
    291           "source": "opus"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "LLaMA2's training data cutoff is not stated. The model was released in 2023 and could have been trained on data containing FLANV2 test examples (published 2022), but this is not discussed.",
    299           "source": "opus"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "No discussion of whether FLANV2 test examples could have appeared in LLaMA2's pre-training data. While LoRA adapters are fine-tuned on training splits, the base model's exposure to test data is not addressed.",
    305           "source": "opus"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "FLANV2 was published in 2022 and LLaMA2 was trained on data collected through 2023, meaning benchmark examples were available online before training. This contamination risk is not discussed.",
    311           "source": "opus"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants in this study.",
    319           "source": "opus"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants in this study.",
    325           "source": "opus"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants in this study.",
    331           "source": "opus"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants in this study.",
    337           "source": "opus"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants in this study.",
    343           "source": "opus"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants in this study.",
    349           "source": "opus"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants in this study.",
    355           "source": "opus"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "Computational complexity is analyzed theoretically (O(T) vs O(N) vs O(NL) in Table 1) and evaluation budgets are compared for SH (Figure 5), but no actual wall-clock time, GPU hours, or monetary costs are reported for inference or routing.",
    363           "source": "opus"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "No total computational budget is stated. The paper does not report GPU hours, total training time for the 48 adapters, or compute used for evaluation. Hardware specifications are also absent.",
    369           "source": "opus"
    370         }
    371       },
    372       "experimental_rigor": {
    373         "seed_sensitivity_reported": {
    374           "applies": true,
    375           "answer": false,
    376           "justification": "Main routing results are not reported across multiple random seeds. The SH analysis (Section 4.5) uses 100 runs with random validation subsets, but the core performance claims (Tables 2, 3, 6) appear to be from a single configuration without seed sensitivity analysis.",
    377           "source": "opus"
    378         },
    379         "number_of_runs_stated": {
    380           "applies": true,
    381           "answer": false,
    382           "justification": "For SH analysis, '100 independent runs' is stated. For the main routing results, the number of runs is not stated. Task representation construction involves random sampling of validation queries, but whether main results represent a single draw or average over multiple is not specified.",
    383           "source": "opus"
    384         },
    385         "hyperparameter_search_budget": {
    386           "applies": true,
    387           "answer": false,
    388           "justification": "While Table 10 (Appendix B.6) ablates K values 1-5, no overall hyperparameter search budget is reported. The softmax temperature τ=0.2 appears fixed without justification of how it was selected.",
    389           "source": "opus"
    390         },
    391         "best_config_selection_justified": {
    392           "applies": true,
    393           "answer": true,
    394           "justification": "K=3 is selected based on the ablation in Table 10, which shows K=2 and K=3 perform nearly identically (101.8% vs 101.6%), with K=3 chosen for consistency with baselines. The paper transparently states this rationale.",
    395           "source": "opus"
    396         },
    397         "multiple_comparison_correction": {
    398           "applies": false,
    399           "answer": false,
    400           "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable.",
    401           "source": "opus"
    402         },
    403         "self_comparison_bias_addressed": {
    404           "applies": true,
    405           "answer": false,
    406           "justification": "The authors evaluate their own system (LORAUTER) against baselines they re-implemented (ARROW, SpectR) and against results copied from original papers (LORARETRIEVER). They do not acknowledge or discuss the systematic bias of authors evaluating their own system, as documented by Lucic et al. (2018).",
    407           "source": "opus"
    408         },
    409         "compute_budget_vs_performance": {
    410           "applies": true,
    411           "answer": true,
    412           "justification": "Figure 5 explicitly plots normalized performance as a function of total evaluations (compute budget) for SH vs uniform selection. Table 1 compares adapter selection overhead complexity (O(T) vs O(N) vs O(NL)) across all methods.",
    413           "source": "opus"
    414         },
    415         "benchmark_construct_validity": {
    416           "applies": true,
    417           "answer": false,
    418           "justification": "The paper uses FLANV2 with task-specific metrics and normalized averages but does not discuss whether this benchmark setup actually measures routing effectiveness, or whether normalized average is a valid aggregate measure of adapter routing quality.",
    419           "source": "opus"
    420         },
    421         "scaffold_confound_addressed": {
    422           "applies": false,
    423           "answer": false,
    424           "justification": "No scaffolding is involved in the evaluation. LORAUTER is a routing framework, not an agentic scaffold.",
    425           "source": "opus"
    426         }
    427       },
    428       "data_leakage": {
    429         "temporal_leakage_addressed": {
    430           "applies": true,
    431           "answer": false,
    432           "justification": "FLANV2 tasks were published in 2022 and LLaMA2 was trained on 2023 data. The model could have seen benchmark examples during pre-training. This temporal leakage is not discussed.",
    433           "source": "opus"
    434         },
    435         "feature_leakage_addressed": {
    436           "applies": true,
    437           "answer": false,
    438           "justification": "No discussion of whether the evaluation setup leaks information. For instance, the text encoder used for routing could capture features that implicitly reveal the task identity beyond what a realistic user query would contain.",
    439           "source": "opus"
    440         },
    441         "non_independence_addressed": {
    442           "applies": true,
    443           "answer": false,
    444           "justification": "No discussion of whether train and test examples within FLANV2 tasks share structural similarities or whether adapters trained on related tasks create dependencies.",
    445           "source": "opus"
    446         },
    447         "leakage_detection_method": {
    448           "applies": true,
    449           "answer": false,
    450           "justification": "No concrete leakage detection or prevention method is applied (no canary strings, membership inference, n-gram overlap analysis, or decontamination).",
    451           "source": "opus"
    452         }
    453       }
    454     }
    455   },
    456   "claims": [
    457     {
    458       "claim": "LORAUTER achieves 101.2% of Oracle task-aligned adapter performance in the non-OOD setting on LLaMA2-7B.",
    459       "evidence": "Figure 2 and Table 6 show 101.2% normalized average for LORAUTER vs. 100% Oracle on LLaMA2-7B non-OOD; Table 10 shows K=2 achieves 101.8%.",
    460       "supported": "strong"
    461     },
    462     {
    463       "claim": "LORAUTER outperforms the strongest baseline (LORARETRIEVER) by 5.2 percentage points in OOD settings.",
    464       "evidence": "Figure 2 shows LORAUTER at 88.4% vs. LORARETRIEVER at 83.2% for LLaMA2-7B OOD; the advantage also holds on the 13B model, though by a smaller margin (86.8% vs. 85.9%).",
    465       "supported": "strong"
    466     },
    467     {
    468       "claim": "LORAUTER scales to 1,567 noisy HuggingFace adapters while maintaining competitive performance (85.7% OOD) close to the curated 48-adapter pool (88.4%).",
    469       "evidence": "Table 8 reports OOD performance: 48 adapters = 88.4%, HF-only = 85.7%, HF+48 = 89.6% on LLaMA2-7B.",
    470       "supported": "strong"
    471     },
    472     {
    473       "claim": "Successive Halving reduces the adapter evaluation budget by more than 2x compared to uniform selection without sacrificing performance.",
    474       "evidence": "Figure 5 and Figure 10 show SH reaches near-peak performance at roughly half the evaluation budget of uniform selection, across both 7B and 13B models with 100 independent runs.",
    475       "supported": "strong"
    476     },
    477     {
    478       "claim": "Both retrieval and composition components independently contribute to LORAUTER's performance improvements.",
    479       "evidence": "Table 2 shows swapping only composition into LORARETRIEVER improves from 92.9% to 98.6% non-OOD; swapping only retrieval improves to 96.7%; combining both reaches 101.2%.",
    480       "supported": "moderate"
    481     },
    482     {
    483       "claim": "LORAUTER with pseudo-task clustering (K-Means) achieves performance comparable to routing with ground-truth task labels.",
    484       "evidence": "Section 4.4 reports best clustering configurations reach 101.5% (7B) and 99.3% (13B) non-OOD, vs. 101.2% and 98.8% with ground-truth labels.",
    485       "supported": "strong"
    486     }
    487   ],
    488   "methodology_tags": [
    489     "benchmark-eval"
    490   ],
    491   "key_findings": "LORAUTER is a training-free LoRA adapter routing framework that operates at the task level rather than directly mapping queries to adapters, using small validation sets to construct task representations and Successive Halving for efficient adapter-task pairing. On a 48-task FLANV2 benchmark with LLaMA2-7B/13B, LORAUTER achieves 101.2% of Oracle performance in-domain and outperforms the best baseline (LORARETRIEVER) by 5.2pp out-of-domain. The framework scales robustly to 1,500+ noisy public adapters with only 2.7pp performance degradation, and Successive Halving reduces adapter evaluation cost by over 2x. Both the task-level retrieval and input-aware weighted composition components independently contribute to performance gains over the LORARETRIEVER baseline.",
    492   "red_flags": [
    493     {
    494       "flag": "No code released",
    495       "detail": "No source code or repository link is provided, making independent reproduction infeasible despite detailed algorithmic description."
    496     },
    497     {
    498       "flag": "No significance tests on main results",
    499       "detail": "All comparative claims (e.g., +5.2pp over LORARETRIEVER) are made without statistical testing; test sets of 50 samples per task are small enough that differences may not be statistically reliable."
    500     },
    501     {
    502       "flag": "No limitations section",
    503       "detail": "The paper lacks any dedicated limitations or threats-to-validity section; the brief impact statement does not address methodological limitations."
    504     },
    505     {
    506       "flag": "No funding disclosure",
    507       "detail": "No funding source is acknowledged anywhere in the paper."
    508     },
    509     {
    510       "flag": "Benchmark contamination unaddressed",
    511       "detail": "FLANV2 benchmark tasks predate LLaMA2's training; the possibility that base model knowledge from pretraining inflates performance is never discussed."
    512     },
    513     {
    514       "flag": "Disadvantaged LORAHUB baseline",
    515       "detail": "LORAHUB is evaluated with only 20 randomly sampled adapters from the pool (per Appendix A), while LORAUTER uses all 48; this may not be a fair comparison."
    516     }
    517   ],
    518   "cited_papers": [
    519     {
    520       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    521       "relevance": "Foundation method that LORAUTER builds on; defines the adapter format and parameter-efficient fine-tuning approach."
    522     },
    523     {
    524       "title": "LoraRetriever: Input-aware LoRA Retrieval and Composition for Mixed Tasks in the Wild",
    525       "relevance": "Primary baseline and direct predecessor; LORAUTER's main experimental comparison and problem formulation follow Zhao et al. 2024."
    526     },
    527     {
    528       "title": "Lorahub: Efficient Cross-Task Generalization via Dynamic LoRA Composition",
    529       "relevance": "Key baseline method that learns fusion weights; LORAUTER's composition builds on but improves over this approach."
    530     },
    531     {
    532       "title": "Towards Modular LLMs by Building and Reusing a Library of LoRAs (ARROW)",
    533       "relevance": "Competitive baseline using spectral routing; provides an O(NL) alternative approach that LORAUTER outperforms."
    534     },
    535     {
    536       "title": "Finetuned Language Models Are Zero-Shot Learners (FLAN/FLANV2)",
    537       "relevance": "Source of the 48-task evaluation benchmark and adapter training tasks used throughout all experiments."
    538     },
    539     {
    540       "title": "AdapterSoup: Weight Averaging to Improve Generalization of Pretrained Language Models",
    541       "relevance": "Prior adapter composition method requiring training data; represents one approach LORAUTER supersedes."
    542     },
    543     {
    544       "title": "SpectR: Dynamically Composing LM Experts with Spectral Routing",
    545       "relevance": "Contemporary spectral routing baseline included in evaluations."
    546     },
    547     {
    548       "title": "Non-stochastic Best Arm Identification and Hyperparameter Optimization (Successive Halving)",
    549       "relevance": "Algorithm used in LORAUTER for efficient adapter selection; reduces evaluation budget by >2x."
    550     }
    551   ],
    552   "engagement_factors": {
    553     "practical_relevance": {
    554       "score": 2,
    555       "justification": "Directly applicable to practitioners using public HuggingFace LoRA adapter pools, with training-free deployment and tested on 1500+ real-world adapters."
    556     },
    557     "surprise_contrarian": {
    558       "score": 1,
    559       "justification": "The task-level intermediary insight is intuitive and presented straightforwardly; it does not strongly challenge conventional wisdom in the field."
    560     },
    561     "fear_safety": {
    562       "score": 0,
    563       "justification": "No AI safety concerns raised; impact statement briefly mentions inherited biases but the paper poses no novel safety risks."
    564     },
    565     "drama_conflict": {
    566       "score": 0,
    567       "justification": "Standard ML systems paper with no controversy or conflict angle."
    568     },
    569     "demo_ability": {
    570       "score": 2,
    571       "justification": "The method uses only public HuggingFace adapters and models; a practitioner could implement LORAUTER from the paper's description, though no code is released."
    572     },
    573     "brand_recognition": {
    574       "score": 1,
    575       "justification": "EPFL is a respected research institution but not a top-tier industry AI lab; no involvement from Meta, Google, OpenAI, or similar high-recognition organizations."
    576     }
    577   },
    578   "hn_data": {
    579     "threads": [],
    580     "top_points": 0,
    581     "total_points": 0,
    582     "total_comments": 0
    583   }
    584 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs