ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (26837B)


      1 {
      2   "paper": {
      3     "title": "KnapSpec: Self-Speculative Decoding via Adaptive Layer Selection as a Knapsack Problem",
      4     "authors": [
      5       "Seongjin Cha",
      6       "Gyuwan Kim",
      7       "Dongsu Han",
      8       "Tao Yang",
      9       "Insu Han"
     10     ],
     11     "year": 2026,
     12     "venue": "arXiv",
     13     "arxiv_id": "2602.20217"
     14   },
     15   "scan_version": 2,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval", "theoretical"],
     18   "key_findings": "KnapSpec reformulates self-speculative decoding layer selection as a 0/1 knapsack problem, decoupling Attention and MLP layers to account for their asymmetric, context-length-dependent latencies. The method achieves up to 1.47x wall-clock speedup across Qwen3 and Llama3 models (1B-70B) on reasoning and summarization tasks, consistently outperforming prior SSD baselines. The proposed TPT (Tokens-per-Time) metric correlates much more strongly with actual throughput (r=0.837) than acceptance rate alone (r=0.538), and a formal proof establishes cosine similarity as a sound proxy for greedy acceptance rate.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "All evaluation benchmarks are publicly available: AIME24/25, MMLU-Pro, GovReport, PG19, and BookSum are standard public datasets cited with references."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No requirements.txt, Dockerfile, library versions, or environment specifications are provided. The hardware used for experiments is never specified despite the method being 'hardware-aware'."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The algorithm is described (Algorithms 1-2) but implementation details for reproduction are absent."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Table 2 reports only point estimates for TPT, speedup, and acceptance rate across all models and datasets. No confidence intervals, error bars, or ± notation appear anywhere in the paper."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims KnapSpec 'consistently outperforms' all baselines but no statistical significance tests are performed. All comparisons are based on raw number comparisons."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Speedup ratios (e.g., 1.47x, 1.43x) are reported relative to autoregressive baseline (1.00x), and absolute TPT values provide baseline context. Figure 2 reports R² values (0.700 vs 0.290) for correlation analysis."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification for why these specific benchmarks or dataset sizes were chosen. No mention of how many examples from each benchmark were used in evaluation."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "All results appear to be from single runs. No standard deviations, variance across seeds, or spread measures are reported in any table or figure."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Compares against autoregressive decoding (AR), DEL, SWIFT, and CLaSp baselines across all experiments (Table 2). Table 1 provides a feature comparison."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "All baselines are recent: SWIFT (2024), CLaSp (2025), DEL (2025). These represent the current state of the art in training-free self-speculative decoding."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Section 5.2 provides ablation studies: TPT vs acceptance rate correlation (Figure 2), adaptive layer selection ratio analysis (Figure 3), cosine similarity pruning threshold analysis (Figure 4), and optimization interval trade-off (Figure 6)."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Reports three metrics in Table 2: TPT (tokens-per-time), speedup (wall-clock), and acceptance rate. Also reports Pearson correlation and R² in Figure 2."
     89       },
     90       "human_evaluation": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "The paper evaluates inference speed of speculative decoding. The verification step guarantees the output distribution matches the target model exactly (for greedy decoding), so output quality is identical by construction. Human evaluation of output quality is irrelevant."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Uses standard public benchmarks (AIME24/25, MMLU-Pro, GovReport, PG19, BookSum) which have established test sets. The method has no training phase so there is no dev/test contamination concern within their pipeline."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Table 2 provides per-dataset and per-model-size breakdowns across 6 datasets and 8 model sizes. Figure 5 shows per-layer skip probabilities. Figure 3 shows per-layer-type (Attention vs MLP) analysis."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "The paper notes DEL fails to accelerate base models, but does not discuss any failure cases or limitations of KnapSpec itself. All reported results show KnapSpec winning."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Figure 4 shows that cosine similarity threshold τ > 0.5 causes throughput decline. Figure 6 shows that optimization intervals of 4-32 steps cause excessive overhead, and intervals > 128 cause stale configurations. DEL is shown to degrade performance below autoregressive baseline."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract's claim of 'up to 1.47x wall-clock speedup' is supported by Table 2 (GovReport, Llama3.1-70B: 1.47x). The claim that KnapSpec 'consistently outperforms state-of-the-art SSD baselines' is supported across all Table 2 entries. The theoretical claim about cosine similarity is supported by Lemma 4.1."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Causal claims (e.g., decoupling layers 'enables' adaptive optimization) are supported by ablation studies. Figure 3 shows the causal mechanism (attention skipping increases with context length). The knapsack formulation is shown to produce better TPT configurations via controlled comparisons."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Abstract states 'Our experiments on Qwen3 and Llama3 demonstrate...' bounding claims to tested models. Claims are consistently qualified to the tested benchmarks and model families."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "No discussion of alternative explanations for the observed speedups. For example, the authors do not consider whether the gains could be due to specific architectural properties of Qwen3/Llama3, benchmark-specific effects, or implementation differences with baselines."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Section 5.2 explicitly demonstrates that acceptance rate is an insufficient proxy for actual throughput (Figure 2, r=0.538 vs TPT r=0.837). The paper carefully distinguishes between proxy metrics (TPT, acceptance rate) and the actual outcome (wall-clock throughput)."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Specific open-weight model versions are named: Qwen3-32B, Qwen3-14B, Qwen3-8B, Qwen3-4B, Llama3.1-70B, Llama3.1-8B, Llama3.2-3B, Llama3.2-1B. These identify specific released model weights."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper runs inference on AIME24/25, MMLU-Pro, and summarization benchmarks but provides no prompt templates or formatting instructions used to present these tasks to the models."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Key hyperparameters stated: cosine similarity threshold τ = 0.5, confidence threshold τ_conf = 0.7, maximum draft length D = 10, cached token window m = 5 speculation steps, optimization interval T = 64/128 steps, greedy decoding. Generation lengths: 32K for AIME, 4K for MMLU-Pro."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. KnapSpec is a decoding acceleration method, not an agent-based system."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "No description of how benchmark inputs were formatted or preprocessed. Maximum generation lengths and average input lengths are stated, but no details on input formatting, tokenization, or data preparation pipeline."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No limitations section exists. The paper ends with a brief Conclusion (Section 6) and a boilerplate Impact Statement. No substantive discussion of limitations anywhere."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No threats to validity are discussed. No mention of potential hardware-specific biases, model-architecture dependencies, or limitations of the theoretical assumptions (e.g., equal-norm assumption in Lemma 4.1)."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No explicit scope boundaries stated. The paper does not discuss what settings the method has NOT been tested on, such as non-greedy decoding, different hardware, non-Transformer architectures, or very small/very large context lengths."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No raw experimental data (timing measurements, per-example results, layer selection logs) is released. Only aggregated results in tables and figures."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Benchmark sources are cited with references: AIME24/25 (Zhang & Math-AI, 2024), MMLU-Pro (Wang et al., 2024), GovReport (Huang et al., 2021), PG19 (Rae et al., 2020), BookSum (Kryściński et al., 2022). Evaluation setup parameters are described in Section 5."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. All data sources are standard public benchmarks."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "The method pipeline is documented (Algorithms 1-2), but the experimental evaluation pipeline (how benchmarks were loaded, how timing was measured, warmup procedures, etc.) is not described."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding sources are mentioned anywhere in the paper. No acknowledgments section is present."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are listed: KAIST School of Electrical Engineering and UC Santa Barbara Department of Computer Science. These are academic institutions."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding statement makes this unanswerable."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is present in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "The paper evaluates a decoding acceleration method, not the models' capability on benchmarks. All compared methods use the same target model, so contamination would affect all methods equally and is irrelevant to the speedup claims."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Same rationale: the paper measures inference speed, not task performance. Train/test overlap does not affect the comparative speedup measurements."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "Benchmark contamination is irrelevant to the paper's claims about decoding throughput acceleration."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in the study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in the study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in the study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Wall-clock speedup ratios and absolute TPT values are reported for all experiments in Table 2. Optimization overhead percentages are reported in Figure 6. The method explicitly optimizes for inference throughput."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No hardware specification (GPU model, memory) is provided despite the method being 'hardware-aware'. No total compute budget, experiment duration, or resource requirements are stated."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No mention of random seeds or seed sensitivity analysis. All results appear to be from single runs."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The number of experimental runs is never stated. Results in Table 2 are presented without indicating whether they are from single or multiple runs."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "While ablation studies explore τ (Figure 4) and optimization interval (Figure 6), no systematic hyperparameter search budget is reported for selecting the final values of τ=0.5, τ_conf=0.7, D=10, or m=5."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "Parameter selection is justified through ablation: τ=0.5 selected via Figure 4 analysis (best balance of speed and memory), optimization interval of 64 justified via Figure 6 (peak throughput). The method itself dynamically selects optimal configurations via DP."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The paper makes many comparative claims across 6 datasets, 8 model sizes, and 4 baselines without any statistical testing, let alone multiple comparison correction."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "No acknowledgment that the authors compare KnapSpec against their own implementations of the baselines. No independent evaluation or discussion of potential bias in baseline implementations."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Figure 6 directly shows throughput as a function of optimization overhead. The TPT metric inherently normalizes performance by compute cost. Section 3.6 analyzes runtime and memory complexity."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "No discussion of whether AIME24/25, MMLU-Pro, GovReport, PG19, or BookSum are appropriate benchmarks for evaluating speculative decoding performance. No analysis of whether these benchmarks represent realistic inference workloads."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is involved. KnapSpec is a decoding-level optimization, not an agent or scaffold."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of temporal leakage. While the paper measures speed rather than task accuracy, model familiarity with benchmark data could affect token prediction confidence and thus speculative decoding acceptance rates."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of feature leakage in the evaluation setup."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of independence between training data and benchmark data for the models used."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention methods are applied."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "KnapSpec achieves up to 1.47x wall-clock speedup over autoregressive decoding across various benchmarks.",
    370       "evidence": "Table 2 shows 1.47x speedup on GovReport with Llama3.1-70B, and consistent speedups of 1.06x-1.47x across all model-dataset combinations (Section 5.1).",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "TPT metric correlates much more strongly with actual throughput than acceptance rate (PCC 0.837 vs 0.538).",
    375       "evidence": "Figure 2 shows Pearson correlation coefficients and R² values (0.700 vs 0.290) for TPT vs acceptance rate against measured throughput on Llama-3.1-8B/GovReport (Section 5.2).",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "KnapSpec consistently outperforms all state-of-the-art training-free SSD baselines (SWIFT, CLaSp, DEL).",
    380       "evidence": "Table 2 shows KnapSpec achieves highest TPT and speedup across all 24 model-dataset configurations tested (Section 5.1).",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Cosine similarity is a mathematically sound proxy for the token acceptance rate in greedy decoding.",
    385       "evidence": "Lemma 4.1 (Section 4) provides a formal proof with the equal-norm assumption, showing that sufficiently high cosine similarity guarantees identical token selection. Proof in Appendix A.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "KnapSpec adaptively skips more Attention layers as context length increases, responding to the growing latency asymmetry.",
    390       "evidence": "Figure 3 shows KnapSpec increasing Attention layer skipping from ~7 to ~15 layers as context grows from 1K to 29K tokens, while SWIFT's selection remains relatively static (Section 5.2).",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Cosine similarity pruning threshold τ=0.5 reduces memory by 31% without sacrificing speedup.",
    395       "evidence": "Figure 4 shows throughput and memory reduction percentages across τ values. At τ=0.5, speedup matches the no-pruning case while memory consumption is reduced by 31% (Section 5.2).",
    396       "supported": "moderate"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "No error bars or variance reporting",
    402       "detail": "All 24 model-dataset configurations in Table 2 report single point estimates. Without variance across runs, it is impossible to assess whether the differences between methods are statistically meaningful or due to measurement noise."
    403     },
    404     {
    405       "flag": "Hardware never specified",
    406       "detail": "The paper emphasizes 'hardware-aware' latency modeling as a core contribution but never states what GPU or hardware was used for any experiment. This makes results impossible to reproduce and raises questions about generalizability to other hardware."
    407     },
    408     {
    409       "flag": "No limitations section",
    410       "detail": "The paper has no discussion of limitations, failure modes, or conditions under which KnapSpec might not provide speedups. All 24 configurations show KnapSpec winning, which, combined with the absence of variance reporting, is unusually clean."
    411     },
    412     {
    413       "flag": "Theoretical assumption not empirically validated",
    414       "detail": "Lemma 4.1 assumes ||x||₂ = ||x'||₂ (equal norms between target and draft embeddings). The paper states this is 'empirically supported by modern architectures (e.g., RMSNorm)' but provides no empirical measurement of how well this assumption holds for the tested models."
    415     }
    416   ],
    417   "cited_papers": [
    418     {
    419       "title": "Fast inference from transformers via speculative decoding",
    420       "authors": ["Y. Leviathan", "M. Kalman", "Y. Matias"],
    421       "year": 2023,
    422       "relevance": "Foundational work on speculative decoding for LLM inference acceleration, core technique that KnapSpec builds upon."
    423     },
    424     {
    425       "title": "Accelerating large language model decoding with speculative sampling",
    426       "authors": ["C. Chen", "S. Borgeaud", "G. Irving"],
    427       "year": 2023,
    428       "arxiv_id": "2302.01318",
    429       "relevance": "Complementary foundational work on speculative sampling for LLM acceleration."
    430     },
    431     {
    432       "title": "LayerSkip: Enabling early exit inference and self-speculative decoding",
    433       "authors": ["M. Elhoushi", "A. Shrivastava", "D. Liskovich"],
    434       "year": 2024,
    435       "relevance": "Training-based self-speculative decoding baseline using early-exit and layer dropout."
    436     },
    437     {
    438       "title": "SWIFT: On-the-fly self-speculative decoding for LLM inference acceleration",
    439       "authors": ["H. Xia", "Y. Li", "J. Zhang"],
    440       "year": 2024,
    441       "arxiv_id": "2410.06916",
    442       "relevance": "Key baseline using Bayesian optimization for training-free layer selection in self-speculative decoding."
    443     },
    444     {
    445       "title": "CLaSp: In-context layer skip for self-speculative decoding",
    446       "authors": ["L. Chen", "R. Shan", "H. Wang"],
    447       "year": 2025,
    448       "arxiv_id": "2505.24196",
    449       "relevance": "Key baseline using dynamic programming for layer selection, closest prior work to KnapSpec."
    450     },
    451     {
    452       "title": "DEL: Context-aware dynamic exit layer for efficient self-speculative decoding",
    453       "authors": ["H. E. Zarch", "L. Gao", "C. Jiang", "M. Annavaram"],
    454       "year": 2025,
    455       "arxiv_id": "2504.05598",
    456       "relevance": "Dynamic early-exit SSD baseline that optimizes the Tokens-per-Layer metric."
    457     },
    458     {
    459       "title": "Eagle-2: Faster inference of language models with dynamic draft trees",
    460       "authors": ["Y. Li", "F. Wei", "C. Zhang", "H. Zhang"],
    461       "year": 2024,
    462       "relevance": "Dynamic draft tree approach for speculative decoding with confidence-based early termination."
    463     },
    464     {
    465       "title": "Kangaroo: Lossless self-speculative decoding via double early exiting",
    466       "authors": ["F. Liu", "Y. Tang", "Z. Liu"],
    467       "year": 2024,
    468       "relevance": "Training-based self-speculative decoding using double early-exit with adapter module."
    469     },
    470     {
    471       "title": "Draft on the fly: Adaptive self-speculative decoding using cosine similarity",
    472       "authors": ["M. R. Metel", "P. Lu", "B. Chen"],
    473       "year": 2024,
    474       "relevance": "Adaptive self-speculative decoding using cosine similarity as a heuristic for draft quality, which KnapSpec formalizes theoretically."
    475     },
    476     {
    477       "title": "The Llama 3 herd of models",
    478       "authors": ["A. Grattafiori", "A. Dubey"],
    479       "year": 2024,
    480       "arxiv_id": "2407.21783",
    481       "relevance": "LLM family (Llama3 1B-70B) used for evaluation in summarization experiments."
    482     },
    483     {
    484       "title": "Qwen3 technical report",
    485       "authors": ["A. Yang"],
    486       "year": 2025,
    487       "arxiv_id": "2505.09388",
    488       "relevance": "LLM family (Qwen3 4B-32B) used for evaluation in reasoning experiments."
    489     },
    490     {
    491       "title": "Mamba drafters for speculative decoding",
    492       "authors": ["D. Choi", "S. Oh", "S. Dingliwal"],
    493       "year": 2025,
    494       "arxiv_id": "2506.01206",
    495       "relevance": "Alternative approach to speculative decoding using Mamba architecture for draft models."
    496     }
    497   ]
    498 }

Impressum · Datenschutz