ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25370B)


      1 {
      2   "paper": {
      3     "title": "CMoE: Converting Mixture-of-Experts from Dense to Accelerate LLM Inference",
      4     "authors": [
      5       "Zehua Pei",
      6       "Lancheng Zou",
      7       "Hui-Ling Zhen",
      8       "Xianzhi Yu",
      9       "Wulong Liu",
     10       "Sinno Jialin Pan",
     11       "Mingxuan Yuan",
     12       "Bei Yu"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv preprint",
     16     "arxiv_id": "2502.04416"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper states in the abstract: 'We make our code publicly available at https://github.com/JarvisPei/CMoE.' A GitHub URL is provided."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper uses publicly available datasets: WikiText-2, C4, SlimPajama, and standard benchmarks (BoolQ, PIQA, SciQ, WinoGrande, ARC-Challenge, HellaSwag). No custom datasets were created."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions using Hugging Face Transformers and PyTorch but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "While the paper provides a GitHub link, the paper itself does not contain step-by-step reproduction instructions, a README summary, or scripts to replicate the main experiments. The methodology is described algorithmically but not as reproduction steps."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Tables 1-5 are reported as point estimates (single perplexity values, single accuracy numbers) with no confidence intervals or error bars."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims CMoE outperforms LLaMA-MoE and LLaMA-MoE-v2 based solely on comparing raw numbers in Tables 1 and 2, with no statistical significance tests."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper reports improvements with baseline context throughout: e.g., '1.5x latency reduction at 25% activation', PPL values alongside the dense model baseline (5.27 vs 7.02 for CMoE 75%), and percentage of dense model accuracy recovered ('>76%'). Table 3 reports speedup ratios. Absolute baseline and result numbers are consistently provided."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The calibration dataset uses only 8 examples (2048 sequence length), and fine-tuning uses 2,048 samples. These choices are not justified by any formal analysis. Appendix B shows performance with different calibration sizes (8, 32, 64) but does not justify why 8 was selected as the default."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No standard deviations, variance across seeds, or multiple-run results are reported anywhere in the paper. All results appear to be from single runs."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper compares against the original dense models (Llama-2 7B, Llama-3 8B), LLaMA-MoE, and LLaMA-MoE-v2 as baselines in Tables 1 and 2. Table 4 also compares against Pythia-1.0B and TinyLlama-1.1B."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "LLaMA-MoE (EMNLP 2024) and LLaMA-MoE-v2 (arXiv 2024) are recent and represent state-of-the-art approaches for dense-to-MoE conversion. The comparison is appropriate."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Section 4.2 provides ablation studies on: (1) fine-tuning data size (Figure 2), (2) load balancing (Figure 3), (3) expert configuration at fixed activation ratio (Table 5), (4) calibration data size (Appendix B), and (5) scaling to larger models (Appendix C)."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper uses perplexity on WikiText-2 and C4, downstream task accuracy on 6 benchmarks (BoolQ, PIQA, SciQ, WinoGrande, ARC-Challenge, HellaSwag), and inference speedup measurements."
     88       },
     89       "human_evaluation": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "Human evaluation is not relevant to this work. The paper evaluates a model compression/conversion technique using automated metrics (perplexity, benchmark accuracy, inference speedup), which is standard and appropriate."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Perplexity is measured on WikiText-2 and C4 validation sets (Table 1 explicitly states 'validation sets'). Fine-tuning is done on WikiText-2 training data. Downstream benchmarks are standard held-out evaluations."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Table 2 provides per-benchmark breakdowns across 6 tasks rather than just aggregate performance. Table 5 provides per-configuration breakdowns. Multiple activation ratios (25%, 75%) are separately reported."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper acknowledges that training-free CMoE at 25% activation has poor absolute perplexity (62.30 on WikiText-2) and underperforms pre-trained dense-small models (Table 4). The limitations section in Appendix D discusses cases where the approach may not work well."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Table 4 explicitly shows training-free CMoE-25% underperforms Pythia-1.0B and TinyLlama-1.1B. The S1A3E16 configuration with more complex routing shows worse performance than simpler configurations (Table 5). Diminishing returns from fine-tuning data are also reported."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims can be traced to reported results, with caveats: 'lossless precision in terms of perplexity' at 75% activation rests on Table 1, where training-free CMoE reaches 7.02 vs 5.27 for dense Llama-2 7B on WikiText-2 (about 33% higher, so not strictly lossless). '1.5x latency reduction' is supported by Table 3. '>76% of dense model's downstream accuracy' is drawn from Table 2 data, although only some benchmarks reach that level."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper makes causal claims through ablation studies (removing/modifying components). The balanced assignment algorithm, router construction, and load balancing are each evaluated through controlled experiments. The ablation design uses single-variable manipulation (e.g., varying fine-tuning data size, expert configurations)."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title says 'Accelerate LLM Inference' broadly, but experiments are limited to Llama-2 7B, Llama-2 13B, and Llama-3 8B. The conclusion states 'CMoE offers a practical approach for deploying large language models' without bounding to the tested models. No experiments on non-Llama architectures or significantly larger models."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper does not discuss alternative explanations for its results. For example, it does not consider whether the improvements over LLaMA-MoE/LLaMA-MoE-v2 could stem from implementation differences rather than algorithmic superiority, or whether the calibration dataset (WikiText-2) biases the perplexity results on WikiText-2."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper specifies exact model names: 'Llama-2 7B', 'Llama-3 8B', 'Llama-2 13B'. These are specific model checkpoints from Meta with known architectures and weights, not ambiguous model names."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "This paper does not use prompting. It is about model architecture conversion (dense to MoE), not about querying language models with prompts."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 4 reports: LoRA rank 8, alpha 32, optimizer Adam (beta1=0.9, beta2=0.95), learning rates (0.001 for router, 5.95e-5 for LoRA), load balancing bias update speed gamma=0.001, Ka=10 for activation status, calibration uses 8 examples of 2048 sequence length."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. This is a model architecture conversion method, not an agent-based system."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 4 describes: calibration uses 8 randomly selected examples from WikiText-2 with 2048 sequence length. Fine-tuning uses 2,048 WikiText-2 samples for 1 epoch. Extended fine-tuning uses subsets of SlimPajama up to 1.2B tokens. The selection and usage of data at each stage is described."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Appendix D contains a 'Limitations' paragraph that discusses dependence on calibration dataset choice, potential limitations of the training-free router, and the need for future exploration across more architectures and tasks."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The limitations section mentions specific threats: (1) 'performance of neuron activation profiling and subsequent expert grouping is influenced by the choice of the calibration dataset', (2) the training-free router 'may not fully capture all nuances of routing decisions' compared to end-to-end trained routers, and (3) need for validation 'across a wider array of model architectures'."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what it does NOT show. The limitations mention future work directions but do not explicitly bound the scope (e.g., 'our results do not demonstrate X'). The broad claims in the conclusion ('CMoE offers a practical approach for deploying large language models') are not bounded to the tested settings."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw experimental data (per-example perplexity values, individual benchmark outputs, activation statistics) is made available. Only aggregated results in tables are provided."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The data collection procedure is described: randomly selected samples from WikiText-2 for calibration and fine-tuning, standard benchmark datasets for evaluation, and SlimPajama for extended fine-tuning. The datasets are all public and well-known."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. All data comes from standard public benchmarks (WikiText-2, C4, SlimPajama, BoolQ, PIQA, SciQ, WinoGrande, ARC-Challenge, HellaSwag)."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline is documented: (1) calibration data collected from WikiText-2, (2) neuron activation profiling using ATopK metric, (3) expert grouping via balanced assignment, (4) router construction from activation statistics, (5) optional LoRA fine-tuning. Section 3 and Section 4 describe each step."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding sources or acknowledgments section is present in the paper. Authors are from CUHK and Huawei Noah's Ark Lab, but no explicit funding disclosure is provided."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: 'The Chinese University of Hong Kong' and 'Noah's Ark Lab, Huawei'. The Huawei affiliation is transparently disclosed."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding disclosure is present, so independence cannot be assessed. Authors from Huawei (a company that deploys LLMs) have a potential commercial interest in efficient LLM inference techniques, but no funding or conflict statement is provided to clarify the relationship."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper. Given that Huawei is a major technology company, the absence of any declaration is a gap."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper evaluates Llama-2 and Llama-3 on standard benchmarks but does not state the training data cutoff dates for these models. This is relevant because perplexity on WikiText-2 and downstream benchmark accuracy could be affected by data contamination."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether WikiText-2, C4, or the downstream benchmarks appeared in Llama-2/Llama-3 training data. This is a standard concern for these models and benchmarks."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The benchmarks used (BoolQ, PIQA, WinoGrande, ARC-Challenge, HellaSwag, SciQ) were all published before the training cutoff of Llama-2 and Llama-3. No contamination analysis is performed. However, since the paper compares different conversion methods applied to the same base model, contamination would affect all methods equally, somewhat mitigating the concern for relative comparisons."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Table 3 reports inference speedups for different configurations (e.g., 1.5x-1.6x full model speedup for 25% activation). Figure 2 shows construction time. The paper also states 'conversion completes in under five minutes on a single GPU'."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "The paper states the conversion completes 'in under five minutes on a single GPU', and that LoRA fine-tuning on the 2k samples takes '1 hour'. Figure 2 shows construction time scaling with data size. While GPU type is not specified, the time budget is stated."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "At 75% activation ratio, CMoE achieves lossless precision in terms of perplexity while maintaining 5% acceleration.",
    295       "evidence": "Table 1 shows CMoE 75% training-free achieves 7.02 PPL on WikiText-2 vs 5.27 for dense Llama-2 7B (33% higher, not truly lossless). Table 3 shows S3A3E8 (75% activation) achieves 1.05-1.12x full model speedup.",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "At 25% activation, CMoE reduces end-to-end latency by 1.5x while preserving usable perplexity without additional training.",
    300       "evidence": "Table 3 shows S1A1E8 (25% activation) achieves 1.5-1.6x full model speedup. Table 1 shows training-free PPL of 62.30 on WikiText-2 (vs 5.27 for dense), which is degraded but functional.",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "A brief LoRA fine-tuning process (1 hour, 2,000 samples) recovers over 76% of the dense model's downstream accuracy.",
    305       "evidence": "Table 2 shows CMoE 25% FT on Llama-2 7B achieves 55.04 BoolQ (vs 82.04 dense = 67%), 77.50 SciQ (vs 90.80 = 85%), 57.12 PIQA (vs 78.78 = 72%). The >76% claim appears to cherry-pick SciQ. Average across all 6 tasks: ~50.68/76.21 = ~66.5% recovery.",
    306       "supported": "weak"
    307     },
    308     {
    309       "claim": "CMoE substantially outperforms LLaMA-MoE and LLaMA-MoE-v2 in training-free and lightweight fine-tuning settings.",
    310       "evidence": "Table 1 shows LLaMA-MoE and LLaMA-MoE-v2 produce NaN or >10k PPL in training-free mode at 25% activation, while CMoE achieves 62.30. After 2k fine-tuning, CMoE PPL is 12.73 vs 468.00 (LLaMA-MoE) and 17.72 (LLaMA-MoE-v2). Table 2 downstream results confirm the advantage.",
    311       "supported": "strong"
    312     },
    313     {
    314       "claim": "With extended fine-tuning (1.2B tokens SlimPajama), CMoE-25% surpasses Pythia-1.0B and approaches TinyLlama-1.1B.",
    315       "evidence": "Table 4 shows CMoE-25% with 1.2B SlimPajama achieves 61.64 PIQA (vs 69.21 Pythia, 73.29 TinyLlama), 59.51 BoolQ (vs 57.83 for both Pythia and TinyLlama), 44.02 HellaSwag (vs 47.16 Pythia, 59.20 TinyLlama). CMoE surpasses Pythia on BoolQ and WinoGrande but not on PIQA and HellaSwag.",
    316       "supported": "moderate"
    317     }
    318   ],
    319   "methodology_tags": [
    320     "benchmark-eval"
    321   ],
    322   "key_findings": "CMoE converts dense LLMs (Llama-2 7B, Llama-3 8B) into Mixture-of-Experts architectures without retraining, using neuron activation profiling and balanced assignment clustering. At 75% activation ratio, the conversion achieves near-dense perplexity with modest speedup (5-12%). At 25% activation, it achieves 1.5x speedup but with significant quality degradation that lightweight LoRA fine-tuning partially recovers. CMoE substantially outperforms prior dense-to-MoE conversion methods (LLaMA-MoE, LLaMA-MoE-v2) which fail catastrophically in training-free settings.",
    323   "red_flags": [
    324     {
    325       "flag": "No variance or uncertainty quantification",
    326       "detail": "All results are reported as single-run point estimates with no standard deviations, confidence intervals, or multiple-seed experiments. Given the sensitivity of MoE routing to initialization and calibration data selection, this is a significant omission."
    327     },
    328     {
    329       "flag": "Overstated lossless claim",
    330       "detail": "The abstract and conclusion claim 'lossless precision in terms of perplexity' at 75% activation, but Table 1 shows WikiText-2 PPL of 7.02 vs 5.27 for dense Llama-2 7B (33% higher) and 10.16 vs 6.14 for Llama-3 8B (65% higher). This is a substantial gap presented as negligible."
    331     },
    332     {
    333       "flag": "Cherry-picked accuracy recovery claim",
    334       "detail": "The '>76% of dense model downstream accuracy' claim appears to be based on the best-performing benchmark (SciQ). Averaging across all 6 benchmarks for Llama-2 7B at 25% activation gives approximately 66.5% recovery, not >76%."
    335     },
    336     {
    337       "flag": "Potential calibration data leakage",
    338       "detail": "WikiText-2 is used for calibration and fine-tuning (training set) as well as for perplexity evaluation (validation set). The two splits are disjoint, but they share a domain, so the reported WikiText-2 perplexity may be lower (more favorable) than it would be on out-of-domain text."
    339     },
    340     {
    341       "flag": "Missing GPU/hardware specification for speedup measurements",
    342       "detail": "Table 3 reports inference speedups but does not specify the GPU model used for benchmarking. Speedup ratios can vary significantly across hardware, making these results difficult to verify or compare."
    343     }
    344   ],
    345   "cited_papers": [
    346     {
    347       "title": "LLaMA-MoE: Building Mixture-of-Experts from LLaMA with Continual Pre-training",
    348       "authors": ["Tong Zhu", "Xiaoye Qu", "Daize Dong", "Jiacheng Ruan", "Jingqi Tong", "Conghui He", "Yu Cheng"],
    349       "year": 2024,
    350       "relevance": "Primary baseline for dense-to-MoE conversion using continual pre-training, demonstrating the standard approach CMoE improves upon."
    351     },
    352     {
    353       "title": "LLaMA-MoE v2: Exploring Sparsity of LLaMA from Perspective of Mixture-of-Experts with Post-Training",
    354       "authors": ["Xiaoye Qu", "Daize Dong", "Xuyang Hu", "Tong Zhu", "Weigao Sun", "Yu Cheng"],
    355       "year": 2024,
    356       "arxiv_id": "2411.15708",
    357       "relevance": "Second primary baseline using importance-based neuron partition and post-training for dense-to-MoE conversion."
    358     },
    359     {
    360       "title": "DeepSeek-V3 Technical Report",
    361       "authors": ["Aixin Liu", "Bei Feng", "Bing Xue"],
    362       "year": 2024,
    363       "arxiv_id": "2412.19437",
    364       "relevance": "State-of-the-art MoE architecture whose load balancing mechanism CMoE adopts for its differentiable routing."
    365     },
    366     {
    367       "title": "DeepSeekMoE: Towards Ultimate Expert Specialization in Mixture-of-Experts Language Models",
    368       "authors": ["Damai Dai", "Chengqi Deng", "Chenggang Zhao"],
    369       "year": 2024,
    370       "arxiv_id": "2401.06066",
    371       "relevance": "Influential MoE architecture design with shared and routed experts, which CMoE's architecture is modeled after."
    372     },
    373     {
    374       "title": "MoEfication: Transformer Feed-Forward Layers are Mixtures of Experts",
    375       "authors": ["Zhengyan Zhang", "Yankai Lin", "Zhiyuan Liu", "Peng Li", "Maosong Sun", "Jie Zhou"],
    376       "year": 2021,
    377       "arxiv_id": "2110.01786",
    378       "relevance": "Foundational work demonstrating that FFN layers can be decomposed into experts based on activation sparsity patterns."
    379     },
    380     {
    381       "title": "Deja Vu: Contextual Sparsity for Efficient LLMs at Inference Time",
    382       "authors": ["Zichang Liu", "Jue Wang", "Tri Dao"],
    383       "year": 2023,
    384       "relevance": "Prior work on exploiting contextual sparsity in FFN layers for efficient LLM inference, directly related to CMoE's activation sparsity observations."
    385     },
    386     {
    387       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    388       "authors": ["Edward J Hu", "Yelong Shen", "Phillip Wallis"],
    389       "year": 2021,
    390       "arxiv_id": "2106.09685",
    391       "relevance": "Parameter-efficient fine-tuning method used by CMoE for optional lightweight adaptation after conversion."
    392     },
    393     {
    394       "title": "Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity",
    395       "authors": ["William Fedus", "Barret Zoph", "Noam Shazeer"],
    396       "year": 2022,
    397       "relevance": "Foundational MoE architecture work establishing routing and expert selection paradigms that CMoE builds upon."
    398     },
    399     {
    400       "title": "Sparse Upcycling: Training Mixture-of-Experts from Dense Checkpoints",
    401       "authors": ["Aran Komatsuzaki", "Joan Puigcerver", "James Lee-Thorp"],
    402       "year": 2022,
    403       "arxiv_id": "2212.05055",
    404       "relevance": "Alternative approach to dense-to-MoE conversion via capacity expansion, representing the second paradigm CMoE contrasts against."
    405     },
    406     {
    407       "title": "Learn to be Efficient: Build Structured Sparsity in Large Language Models",
    408       "authors": ["Haizhong Zheng", "Xiaoyan Bai", "Xueshen Liu"],
    409       "year": 2024,
    410       "arxiv_id": "2402.06126",
    411       "relevance": "Training-time approach to learning activation sparsity in LLMs, complementary to CMoE's post-hoc conversion approach."
    412     }
    413   ]
    414 }

Impressum · Datenschutz