scan.json (31117B)
1 { 2 "paper": { 3 "title": "An efficient strategy for fine-tuning large language models", 4 "authors": [ 5 "Benjamin Marsh", 6 "Adam Michaleas", 7 "Darrell O. Ricke", 8 "Shaun Monera", 9 "Shriya Zembruski" 10 ], 11 "year": 2026, 12 "venue": "Frontiers in Artificial Intelligence", 13 "doi": "10.3389/frai.2026.1665992" 14 }, 15 "scan_version": 2, 16 "active_modules": ["experimental_rigor", "data_leakage"], 17 "methodology_tags": ["benchmark-eval"], 18 "key_findings": "DSS combined with full-precision fine-tuning yields the strongest performance for NL-to-Query DSL translation across FLAN-T5 model sizes, but at higher GPU memory cost for smaller models. Under resource constraints, LoRA and QLoRA with a 4:1 alpha-to-rank ratio provide effective performance-efficiency tradeoffs. Counter-intuitively, LoRA and QLoRA used more GPU memory than full-precision for FLAN-T5 Large due to adapter overhead and implementation differences. DSS rationales consistently improved training over label-only supervision across all 8 tested configurations in ablation.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "A public GitHub repository is provided: https://github.com/brmarsh23/An-Efficient-Strategy-for-Fine-Tuning-Large-Language-Models. The paper states 'The code and instructions are available at the following Git Repository link' (Section 1)." 
25 }, 26 "data_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "The Data Availability Statement explicitly states: 'The datasets presented in this article are not readily available because dataset utilized in the submission is Controlled Unclassified Information (CUI) from US Department of Defense computer information systems.'" 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "Section 3.5 describes hardware (Intel Xeon Platinum 8480+, NVIDIA H100 GPUs, 2 TB RAM) and names libraries (PyTorch, PEFT, bitsandbytes, Ray Train) but provides no library versions, requirements.txt, or Dockerfile in the paper itself." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "The paper itself contains no step-by-step reproduction instructions. A GitHub URL is provided with claimed 'instructions,' but the dataset is CUI-restricted, making full reproduction impossible regardless of code availability." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "Tables 3 and 4 report point estimates of evaluation loss with no confidence intervals, error bars, or uncertainty measures. The ablation (Table 4) is averaged over 2 seeds but reports no CI or spread." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper claims full-precision 'yields the strongest overall performance' and DSS 'consistently improves model fine-tuning' but provides no statistical significance tests (no p-values, t-tests, or bootstrap tests) to support any comparative claims." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "Table 4 shows raw loss differences between DSS and label-only training (e.g., +1.4e-3) but no standardized effect sizes (Cohen's d, percentage improvement, or relative differences). 
Main results in Table 3 are raw loss values with no effect size framing." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "The dataset consists of 1,000 questions with no justification for this sample size, no power analysis, and no discussion of whether 1,000 examples is sufficient for the claims being made." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "Main results (Table 3) are single-run best evaluation losses. The ablation (Table 4) is 'averaged over two random seeds' but reports no standard deviation, IQR, or spread measure. The reader cannot assess result stability." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "Three fine-tuning methods are systematically compared: full-precision, LoRA, and QLoRA. The ablation additionally compares DSS (α=0.5) against label-only (α=1.0) training." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "LoRA (Hu et al., 2021) and QLoRA (Dettmers et al., 2023) are the current standard parameter-efficient fine-tuning methods. DSS (Hsieh et al., 2023) is recent. All are appropriate contemporary methods." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "Section 3.7 describes a dedicated ablation study comparing DSS rationale-augmented training (α=0.5) versus label-only training (α=1.0) 'across model sizes and fine-tuning methods.' Results in Table 4." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "The paper reports evaluation loss for task performance, plus GPU memory usage, training samples per second, and total training time as efficiency metrics (Figure 6, Table 3). Multiple dimensions of comparison are provided." 
89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": false, 93 "justification": "No human evaluation of model outputs is performed. The paper fine-tunes models for NL-to-Query DSL translation but never has humans assess whether the generated DSL queries are correct, usable, or semantically appropriate." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": false, 98 "justification": "The 80/20 split produces an 'evaluation dataset' that is used both for learning rate reduction decisions ('evaluation loss was monitored in order to conduct learning rate reduction after 10 epochs of no improvement,' Section 3.5) and for final performance reporting. This makes it a validation set, not a held-out test set." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Results are broken down by model architecture (Small/Base/Large/XL), fine-tuning method (full-precision/LoRA/QLoRA), and hyperparameter settings (Rank/Alpha combinations). Figure 6 provides a quad chart; Table 3 lists top 8 models; Figure 7 breaks down by Rank/Alpha." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": false, 108 "justification": "No qualitative error analysis of model outputs is provided. The paper does not show examples where the fine-tuned model produced incorrect Query DSL or discuss specific failure modes of the generated outputs." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper reports the counter-intuitive finding that 'LoRA and QLoRA methods required significantly more memory to train the highest-performing model type, the FLAN-T5 Large, than the full-precision method' (Section 4), contradicting theoretical expectations." 
114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "Abstract claims are supported: DSS + full-precision strongest (Table 3, loss 0.06384), LoRA provides performance-efficiency tradeoff (Table 3, Section 5.1), QLoRA enables larger models under memory constraints (only method to run FLAN-T5 XL), 4:1 alpha-to-rank ratio (Figure 7, Section 5.2)." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "The primary causal claim — that DSS rationales improve training — is supported by a controlled ablation (Section 3.7, Table 4) that manipulates only α (0.5 vs 1.0) while holding all other hyperparameters constant. This single-variable manipulation is adequate for causal inference within the tested setting." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "The title claims 'fine-tuning large language models' broadly, and the abstract proposes 'a general guide for efficiently fine-tuning LLMs for domain-specific tasks.' However, the study tests only one task (NL to Query DSL) with one model family (FLAN-T5 encoder-decoder). The Limitations section acknowledges this but the title and abstract still overclaim." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": true, 135 "justification": "Section 4 discusses alternative explanations for the counter-intuitive memory findings: additional adapter overhead, dequantization costs during forward/backward passes, and 'implementation differences between the full-precision and LoRA/QLoRA methods' (using standard PyTorch vs PEFT/bitsandbytes libraries)." 
136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": true, 140 "justification": "Section 5.5 explicitly acknowledges: 'the metrics do not directly capture task-level correctness, such as exact match rates on the DSL JSON' and recommends 'future work should incorporate additional metrics, such as BLEU, METEOR, and TER scores.' This directly addresses the gap between measured proxy (token-level loss) and claimed outcome (task effectiveness)." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": true, 147 "justification": "Specific model architectures are identified: FLAN-T5 Small (76.9M), Base (247.6M), Large (770.6M), and XL (2,884.5M) with detailed architecture parameters in Table 1. Teacher model specified as Mixtral 8x22B. These are well-defined open-source models." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": true, 152 "justification": "Figure 2 shows an example input prompt with its three-part structure: DSL interface instructions, dataset description, and Chain-of-Thought prompting. Figure 3 shows the dataset creation process. Figure 4 shows the training step format with task prefixes." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": true, 157 "justification": "Table 2 reports hyperparameters: learning rate (5e-5), patience (10 epochs), LR factor (1e-1), epochs (100), batch size (8), DSS α (0.5). Section 3.6 details LoRA/QLoRA-specific parameters: Rank values (32, 64, 128), LoRA Alpha values, target modules, and dropout rate." 158 }, 159 "scaffolding_described": { 160 "applies": false, 161 "answer": false, 162 "justification": "No agentic scaffolding is used. The work involves standard model fine-tuning with DSS, not agentic workflows." 
163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Sections 3.1-3.2 describe the data pipeline: 1,000 NL questions processed through Mixtral 8x22B with chain-of-thought prompting to generate labels and rationales, then formatted as multi-task training data with task prefixes, and split 80/20 for training and evaluation." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 5.5 is titled 'Limitations' and contains two substantial paragraphs discussing multiple specific limitations of the study." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 5.5 lists specific threats: single downstream task (NL to Query DSL), token-level loss without task-level metrics, FLAN-T5 only without decoder-only architectures, limited random seeds, and incomplete hyperparameter exploration due to compute constraints." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": true, 184 "justification": "Section 5.5 explicitly states: 'conclusions may not directly transfer to other domains, such as open-ended text generation, conversational dialogue, or classification tasks,' and 'the methodology focuses on the FLAN-T5 encoder-decoder family and does not include decoder-only architectures.'" 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": false, 191 "justification": "The dataset is designated as Controlled Unclassified Information (CUI) and is 'not readily available.' Requests must be directed to benjamin.marsh@usmc.mil. Independent verification of the underlying data is not possible." 
192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 3.2 describes: 1,000 NL questions designed for querying organizational data, translated to Query DSL using Mixtral 8x22B as teacher model via chain-of-thought prompting. The prompt structure is shown in Figure 2." 197 }, 198 "recruitment_methods_described": { 199 "applies": true, 200 "answer": false, 201 "justification": "The origin of the 1,000 input natural language questions is never described. Were they hand-crafted, sampled from query logs, or synthetically generated? The paper jumps from describing the task to describing the teacher model output without explaining where the input questions came from." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "The pipeline from input questions through teacher-model generation to fine-tuning format is documented across Sections 3.1-3.2 and Figures 3-4. The paper mentions 'all training examples were screened for correctness and validity' (Section 5.5), though screening criteria are not detailed." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Funding section states: 'This material is based upon work supported by the Department of the Air Force under Air Force Contract No. FA8702-15-D-0001.'" 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are clearly stated: Marine Corps Tactical Systems Support Activity (USMC) and MIT Lincoln Laboratory, Artificial Intelligence Technology. No conflict with evaluated products exists." 
219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": true, 223 "justification": "The funder (US Department of the Air Force) has no commercial interest in any of the evaluated methods or models (FLAN-T5, LoRA, QLoRA are open-source methods from academic/industry research). The funder benefits from knowing which method works, not from a specific outcome." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": true, 228 "justification": "The Conflict of Interest section states: 'The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.'" 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": false, 234 "answer": false, 235 "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. Models are fine-tuned on a custom dataset (NL to Query DSL) created by the authors; contamination from pre-training data is not a relevant concern for this evaluation setup." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": false, 239 "answer": false, 240 "justification": "Same rationale: the evaluation dataset was generated by the authors specifically for this study, not drawn from a public benchmark. There is no pre-trained model benchmark evaluation scenario." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": false, 244 "answer": false, 245 "justification": "No public benchmark is used. The evaluation is on a custom CUI dataset, so standard benchmark contamination concerns do not apply." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. This is a model fine-tuning study with automated evaluation." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in the study." 
258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in the study." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in the study." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in the study." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in the study." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": false, 289 "justification": "No inference cost, latency, or per-example cost is reported. The paper focuses entirely on training costs. How much it costs to run the fine-tuned models at inference time is not discussed." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": true, 294 "justification": "Total compute time is stated: 'The total compute time for the hyperparameter search was 499.6 hours' (Section 3.6). Hardware is detailed: two-node cluster with four NVIDIA H100 80GB GPUs per node. Per-run training times are in Table 3." 295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": false, 301 "justification": "Main hyperparameter search results (Table 3) are single-run best evaluation losses with no seed variation. The ablation (Table 4) averages over only 2 random seeds with no spread reported. The paper acknowledges 'ablation results are averaged over a limited number of random seeds' (Section 5.5)." 
302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": true, 306 "justification": "Section 3.6 explicitly states: '3 runs were performed using full-precision fine-tuning, 39 runs were performed with the LoRA fine-tuning method, and 44 runs with the QLoRA fine-tuning method.' Total of 86 hyperparameter sweeps. Ablation states 'averaged over two random seeds.'" 307 }, 308 "hyperparameter_search_budget": { 309 "applies": true, 310 "answer": true, 311 "justification": "Comprehensive search budget reported: 86 total sweeps, 499.6 compute hours, with explicit enumeration of Rank values (32, 64, 128), Alpha values per Rank, and per-architecture/per-method run counts (Section 3.6)." 312 }, 313 "best_config_selection_justified": { 314 "applies": true, 315 "answer": true, 316 "justification": "Section 3.8 states: 'the best evaluation loss achieved for each hyperparameter sweep was used to perform the final comparison.' Selection criterion is clear and all top 8 configurations are reported in Table 3 rather than just the single best." 317 }, 318 "multiple_comparison_correction": { 319 "applies": true, 320 "answer": false, 321 "justification": "86 hyperparameter configurations are compared with no correction for multiple comparisons. Claims of 'best' and 'consistent' patterns are made without adjusting for the number of comparisons." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": true, 325 "answer": false, 326 "justification": "The paper does not discuss author-evaluation bias. While they use standard library implementations rather than custom re-implementations, they do not acknowledge that implementation choices (e.g., which PyTorch vs PEFT optimizations are used) could systematically advantage one method." 
327 }, 328 "compute_budget_vs_performance": { 329 "applies": true, 330 "answer": true, 331 "justification": "Figure 6 explicitly plots performance (evaluation loss) alongside GPU memory usage, training samples per second, and total training time for each method and model size. Table 3 also pairs loss with GPU usage and training time. This is a central contribution of the paper." 332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": true, 336 "justification": "Section 5.5 explicitly discusses construct validity: 'the metrics do not directly capture task-level correctness, such as exact match rates on the DSL JSON' and recommends additional metrics (BLEU, METEOR, TER). This directly questions whether the measured metric (token loss) captures what matters (task correctness)." 337 }, 338 "scaffold_confound_addressed": { 339 "applies": false, 340 "answer": false, 341 "justification": "No scaffolding is involved. The study compares fine-tuning methods directly, not scaffold-dependent systems." 342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the FLAN-T5 pre-training data or Mixtral 8x22B training data could contain Query DSL patterns or similar structured generation examples that would advantage the models on this task." 349 }, 350 "feature_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether the DSS multi-task training setup (where models learn to generate both rationales and labels) could leak information between tasks at evaluation time, or whether the teacher-generated labels could embed patterns from the teacher's training data." 
354 }, 355 "non_independence_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "The 80/20 train-eval split is applied to 1,000 questions without discussion of whether train and eval examples share structural patterns (e.g., similar question templates, overlapping Query DSL structures) that would inflate performance estimates." 359 }, 360 "leakage_detection_method": { 361 "applies": true, 362 "answer": false, 363 "justification": "No leakage detection or prevention method is used. No overlap analysis, deduplication, or independence verification between train and evaluation splits." 364 } 365 } 366 }, 367 "claims": [ 368 { 369 "claim": "DSS combined with full-precision fine-tuning yields the strongest overall performance across model architectures.", 370 "evidence": "Table 3 shows full-precision FLAN-T5 Large achieves lowest evaluation loss (0.06384), outperforming all LoRA and QLoRA configurations. Figure 6 top-left shows full-precision has lower mean evaluation loss across all architectures.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "A LoRA Alpha-to-Rank ratio of 4:1 provides optimal performance for parameter-efficient fine-tuning.", 375 "evidence": "Figure 7 shows average evaluation loss decreasing with increasing Alpha relative to Rank, with peak performance at 4:1 ratio. Table 3 shows top LoRA model uses Rank 128, Alpha 512 (4:1 ratio).", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "DSS rationales consistently improve model training over label-only supervision across all model sizes and fine-tuning methods.", 380 "evidence": "Table 4 shows all 8 configurations (3 model sizes × 3 methods, minus FLAN-T5 Large full-precision) achieve lower evaluation loss with DSS (α=0.5) than label-only (α=1.0). 
Differences range from +2.5e-4 to +1.6e-2.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "QLoRA enables fine-tuning of larger models (FLAN-T5 XL) under GPU memory constraints that prevent full-precision and LoRA training.", 385 "evidence": "Section 3.6 states only QLoRA could run FLAN-T5 XL within available GPU memory. Table 3 shows FLAN-T5 XL QLoRA achieving competitive loss (0.06874, ranked third overall).", 386 "supported": "strong" 387 }, 388 { 389 "claim": "LoRA and QLoRA can require more GPU memory than full-precision fine-tuning at larger model sizes.", 390 "evidence": "Figure 6 bottom-left shows that for FLAN-T5 Large, full-precision uses the least average GPU memory, while LoRA and QLoRA use more. Section 4 explains this via adapter overhead, dequantization costs, and implementation differences.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "Rationale supervision has the largest impact when model capacity and update precision are most constrained.", 395 "evidence": "Table 4 shows the largest loss improvement from DSS occurs with FLAN-T5 Small QLoRA (+1.6e-2), while the smallest is FLAN-T5 Base LoRA (+2.5e-4). Section 5.3 interprets this as rationales compensating for limited model capacity.", 396 "supported": "moderate" 397 } 398 ], 399 "red_flags": [ 400 { 401 "flag": "Single-task evaluation with broad claims", 402 "detail": "All conclusions are drawn from one translation task (NL to Query DSL) with one model family (FLAN-T5), but the paper's title claims 'fine-tuning large language models' broadly and the abstract proposes 'a general guide.' The generalization gap between what was tested and what is claimed is substantial." 403 }, 404 { 405 "flag": "No task-level correctness evaluation", 406 "detail": "Only token-level cross-entropy loss is reported. No exact match, BLEU, METEOR, or semantic correctness metrics are used. 
The paper acknowledges this in limitations but draws conclusions about method effectiveness without knowing whether generated Query DSL outputs are actually correct or usable." 407 }, 408 { 409 "flag": "Validation set used as test set", 410 "detail": "The evaluation portion of the 80/20 split was used for learning rate scheduling decisions (LR reduction after 10 epochs of no improvement) and simultaneously for final performance reporting. This conflates validation and testing, potentially inflating reported performance." 411 }, 412 { 413 "flag": "No error bars or significance tests", 414 "detail": "Main results are single-run best losses. Even the ablation uses only 2 seeds with no spread reported. Comparative claims ('yields the strongest performance,' 'consistently improves') are made without any statistical testing." 415 }, 416 { 417 "flag": "Restricted dataset prevents verification", 418 "detail": "The dataset is designated as CUI and unavailable to the public. Independent researchers cannot verify the data quality, check for issues in teacher-generated labels/rationales, or replicate the experiments." 419 } 420 ], 421 "cited_papers": [ 422 { 423 "title": "Distilling step-by-step! Outperforming larger language models with less training data and smaller model sizes", 424 "authors": ["C.-Y. Hsieh", "C.-L. Li", "C.-K. Yeh", "H. Nakhost", "Y. Fujii", "A. Ratner"], 425 "year": 2023, 426 "arxiv_id": "2305.02301", 427 "doi": "10.18653/v1/2023.findings-acl.507", 428 "relevance": "Core method used in this paper — knowledge distillation with chain-of-thought rationales to train smaller models efficiently." 429 }, 430 { 431 "title": "LoRA: Low-Rank Adaptation of Large Language Models", 432 "authors": ["E. J. Hu", "Y. Shen", "P. Wallis", "Z. Allen-Zhu", "Y. Li", "S. Wang"], 433 "year": 2021, 434 "arxiv_id": "2106.09685", 435 "relevance": "Parameter-efficient fine-tuning method benchmarked in this paper; foundational PEFT technique for LLMs." 
436 }, 437 { 438 "title": "QLoRA: Efficient Finetuning of Quantized LLMs", 439 "authors": ["T. Dettmers", "A. Pagnoni", "A. Holtzman", "L. Zettlemoyer"], 440 "year": 2023, 441 "arxiv_id": "2305.14314", 442 "relevance": "Quantized parameter-efficient fine-tuning method benchmarked in this paper; enables training larger models under memory constraints." 443 }, 444 { 445 "title": "Chain-of-thought prompting elicits reasoning in large language models", 446 "authors": ["J. Wei", "X. Wang", "D. Schuurmans", "M. Bosma", "B. Ichter", "F. Xia"], 447 "year": 2023, 448 "arxiv_id": "2201.11903", 449 "relevance": "Prompting methodology used for eliciting rationales from the teacher model in the DSS pipeline." 450 }, 451 { 452 "title": "Parameter-efficient fine-tuning for large models: a comprehensive survey", 453 "authors": ["Z. Han", "C. Gao", "J. Liu", "J. Zhang", "S. Q. Zhang"], 454 "year": 2024, 455 "arxiv_id": "2403.14608", 456 "relevance": "Survey of PEFT methods relevant to understanding the landscape of efficient fine-tuning approaches." 457 }, 458 { 459 "title": "FrugalGPT: How to use large language models while reducing cost and improving performance", 460 "authors": ["L. Chen", "M. Zaharia", "J. Zou"], 461 "year": 2023, 462 "arxiv_id": "2305.05176", 463 "relevance": "Addresses cost-efficiency of LLM usage, directly relevant to the cost-constrained fine-tuning focus of this paper." 464 }, 465 { 466 "title": "Exploring the limits of transfer learning with a unified text-to-text transformer", 467 "authors": ["C. Raffel", "N. Shazeer", "A. Roberts", "K. Lee", "S. Narang", "M. Matena"], 468 "year": 2023, 469 "arxiv_id": "1910.10683", 470 "relevance": "T5 architecture foundation paper — the model family (FLAN-T5) evaluated in this work." 471 }, 472 { 473 "title": "Finetuned language models are zero-shot learners", 474 "authors": ["J. Wei", "M. Bosma", "V. Zhao", "K. Guu", "A. W. Yu", "B. 
Lester"], 475 "year": 2022, 476 "relevance": "Instruction tuning methodology that produced the FLAN-T5 models used in this study." 477 }, 478 { 479 "title": "Capabilities of GPT-4 on medical challenge problems", 480 "authors": ["H. Nori", "N. King", "S. M. McKinney", "D. Carignan", "E. Horvitz"], 481 "year": 2023, 482 "arxiv_id": "2303.13375", 483 "relevance": "Demonstrates domain-specific LLM evaluation challenges, motivating the need for fine-tuning approaches like those studied here." 484 }, 485 { 486 "title": "Optimizing large language models with an enhanced LoRA fine-tuning algorithm for efficiency and robustness in NLP tasks", 487 "authors": ["J. Hu", "X. Liao", "J. Gao", "Z. Qi", "H. Zheng", "C. Wang"], 488 "year": 2024, 489 "arxiv_id": "2412.18729", 490 "relevance": "Recent work on enhanced LoRA fine-tuning, directly related to the parameter-efficient methods evaluated here." 491 } 492 ] 493 }